def download_linke(coords, proxy, port, saveFile, saveMode):

    print(proxy, port)
    print(proxy != "")

    url = "http://www.soda-is.com/eng/services/service_invoke/gui.php?" + "xml_descript=soda_tl.xml&Submit2=Month"

    session = Session()
    session.verify = False

    if proxy != "":
        # requests expects a {scheme: "host:port"} mapping
        proxies = {"http": "%s:%s" % (proxy, port), "https": "%s:%s" % (proxy, port)}
        session.proxies = proxies

    br = RoboBrowser(session=session, parser="lxml")
    br.open(url)

    linke_form = br.get_forms()[1]

    num = len(coords)
    index = 0

    with open(saveFile, saveMode) as f:
        try:
            for coord in coords:
                inlon, inlat = coord
                linke_form["lat"].value = inlat
                linke_form["lon"].value = inlon

                sf = linke_form.submit_fields.getlist("execute")
                br.submit_form(linke_form, submit=sf[0])

                linke_table = br.find("table", {"cellspacing": "0", "cellpadding": "2"})

                linkes = get_monthly_linke_str(get_linke_values(linke_table))
                s = "%s,%s,%s\n" % (format(inlon, "0.5f"), format(inlat, "0.5f"), linkes)

                if len(s) > 48:
                    f.write(s)
                    print "Done with point %i of %i: (%s, %s)" % (
                        index + 1,
                        num,
                        format(inlon, "0.5f"),
                        format(inlat, "0.5f"),
                    )

                index += 1

                br.back()

            print "DONE!"

        except Exception as e:

            not_dl = list(coords[index:])
            with open(saveFile + "_notdownloaded.txt", "w") as nd:
                for c in not_dl:
                    nd.write("%s,%s\n" % (str(c[0]), str(c[1])))
            print(e)
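
A minimal driver for download_linke (a sketch: the coordinate values and file name are placeholders, and it assumes the get_linke_values / get_monthly_linke_str helpers used above are importable from the same module):

if __name__ == "__main__":
    # Two sample (lon, lat) points; an empty proxy string means a direct connection.
    sample_coords = [(2.35000, 48.85000), (13.40000, 52.52000)]
    download_linke(sample_coords, proxy="", port="", saveFile="linke_values.csv", saveMode="w")
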
def get_video_url(url):

    br = RoboBrowser(history=True, parser='lxml')
    br.open(url)

    cn = input('Convert the page to Simplified Chinese? (y/n) ')
    if not cn:
        cn = 'y'
    if cn == 'y':
        # shift to simplified chinese
        lang = br.get_forms()[0]
        lang['session_language'].options = ['cn_CN']
        lang['session_language'].value = 'cn_CN'
        br.submit_form(lang)

    # get video title
    vid_title = br.find('div', {'id': 'viewvideo-title'}).text.strip()
    print('the video you want to download is: {0}'.format(vid_title))
    print('-----------------------------------------------------------')

    # get video id
    vid_id = re.findall(
        r'\d{6}',
        br.find('a', {
            'href': '#featureVideo'
        }).attrs['onclick'])[0]

    # get real video link
    vid_real_url = 'http://192.240.120.34//mp43/{}.mp4'.format(vid_id)
    return vid_real_url, re.sub(
        r"""[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。|?、~@#¥%……&*():]+""", " ",
        vid_title).strip()
Example #3
    def test_calc_interface(self):
        operation = "5,+,2"
        expected_result = 7

        # Add some result to DB
        requests.post('/'.join((TEST_URL, 'calc')),
            data={'operation':'998,-,888'})

        # Init object
        browser = RoboBrowser(history=True, parser='html.parser')
        browser.open(TEST_URL)

        # Fill calc form
        calc_form = browser.get_form(action='/calc')
        calc_form['operation'] = operation
        browser.submit_form(calc_form)

        # Get result
        result_raw = browser.find(id="result").text
        self.assertEqual(int(result_raw), expected_result)

        # Check result link
        browser.follow_link(browser.find(id='result_link'))
        self.assertEqual((operation, expected_result),
            (browser.find(id="operation").text, int(browser.find(id="result").text)))
Example #4
File: pcomix.py  Project: Selezar211/sdfsdf
def main():
    url = 'http://www.porncomix.info/milky-milk-2-dragon-ball-z-english/'
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')

    browser.open(url)

    wrapper = browser.find('div', {'id': 'gallery-1'})
    imgs = wrapper.find_all('a', href=True)

    img_list = []

    for line in imgs:
        img_list.append(line['href'])

    name = 1
    for line in img_list:
        browser.open(line)
        wrapper_div = browser.find('div', {'class': 'attachment-image'})
        my_img = wrapper_div.find('img', src=True)

        img_data = requests.get(my_img['src']).content
        with open(str(name) + '.jpg', 'wb') as handler:
            handler.write(img_data)

        name += 1

        with open('Walao.txt', 'a') as f:
            f.write(my_img['src'] + '\n')

            print(my_img['src'])
Example #5
def parse_answer_page(page_url):
    ans_browser = RoboBrowser(history=True, user_agent='nemo1')
    ans_browser.open(page_url)
    title = ans_browser.find(class_="zm-item-title").a
    # pdb.set_trace()
    title_text = title.get_text()
    print(title_text)
    title_id = title["href"].split("/")[-1]
    directory = "/Users/nemo/Pictures/zhihu/" + title_text
    if not os.path.exists(directory):
        os.makedirs(directory)
        with open(directory + "/url_record.txt", "a") as output:
            output.write(page_url + "\n")

    content_div = ans_browser.find("div",
                                   class_="zm-editable-content clearfix")
    count = 0
    for img_tag in content_div.find_all("img"):
        count = count + 1
        try:
            img_src_url = img_tag["data-original"]
            print(count, img_src_url)
        except KeyError:
            pdb.set_trace()
            print("No data original " + str(img_tag))
            continue
Example #6
class Answer(object):
    """ Zhihu parser, answer obj"""
    def __init__(self, page_url):
        self.url = page_url
        self.ans_browser = RoboBrowser(history=True,user_agent='nemo1')
        self.ans_browser.open(self.url)

    def get_related_question_url(self):
        h2_tag = self.ans_browser.find("h2", class_="zm-item-title")
        return "https://www.zhihu.com" + h2_tag.a["href"]

    def get_related_question_title(self):
        h2_tag = self.ans_browser.find("h2", class_="zm-item-title")
        return h2_tag.a.get_text()


    def get_img_url_list(self):
        content_div = self.ans_browser.find("div", class_="zm-editable-content clearfix")
        results = []
        for img_tag in content_div.find_all("img"):
            if "data-original" in img_tag:
                results.append(img_tag["data-original"])
            elif "src" in img_tag:
                results.append(img_tag["src"])
        return results

    def get_thumbs_up_count(self): 
        div = self.ans_browser.find("div", class_="zm-item-vote-info")
        if div:
            return int(div["data-votecount"])
Example #7
def run(wait):
    """Starts the scrapping proccess.
    Opens a teamstats page and gathers all the form inputs
    Then sends these inputs to parseSeason which opens a new page for every possible option in the form
    If you get an error at the start, with role.find_all, just try again, nfl.com returns weird pages sometimes
    """

    logger = makeLogger('main', r'./logs_nflteamStat/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
    logger.debug('waiting %d seconds', wait)
    time.sleep(wait)

    pool = Pool(processes=int(get_proxy_count()/2.5))

    #html5lib parser required for broken html on gameSplits
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    startingUrl = "http://www.nfl.com/stats/categorystats?tabSeq=2&offensiveStatisticCategory=GAME_STATS&conference=ALL&role=TM&season=2015&seasonType=REG"
    browser = open_or_follow_link(logger, browser, 'open', startingUrl)
    
    role = browser.find(id="role")
    roles = role.find_all("option")
    offensiveCategory = browser.find(id="offensive-category")
    offensiveCategories = offensiveCategory.find_all("option")
    defensiveCategory = browser.find(id="defensive-category")
    defensiveCategories = defensiveCategory.find_all("option")
    season = browser.find(id="season-dropdown")
    seasons = season.find_all("option")
    seasonType = browser.find(id="season-type")
    seasonTypes = seasonType.find_all("option")

    
    for role in roles:
        availableCategories = None
        if role.text == "Offense":
            availableCategories = offensiveCategories
        elif role.text == "Defense":
            availableCategories = defensiveCategories
        else:
            print("unknown role")
            continue

        for category in availableCategories:
            if category.text == "Category...":
                continue

            for season in seasons:
                if season.text == "Season..." or convertToNumber(removeNewLine(season.text)) < 1960:
                    continue
                #parseSeason(role, category, season, seasonTypes)
                pool.apply_async(parseSeason, (role, category, season, seasonTypes,))

    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time: ' + str(datetime.now()-startTime ))

    closeLogger('main')
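
The fan-out pattern above (queue every combination with apply_async, then close and join the pool) reduces to this self-contained sketch, with the nfl.com-specific parsing swapped for a placeholder worker:

from multiprocessing import Pool

def work(role, category, season):
    # stand-in for parseSeason: each call runs in its own worker process
    return "%s / %s / %s" % (role, category, season)

if __name__ == "__main__":
    pool = Pool(processes=4)
    results = []
    for role in ("Offense", "Defense"):
        for category in ("GAME_STATS", "RUSHING"):
            for season in (2014, 2015):
                results.append(pool.apply_async(work, (role, category, season)))
    pool.close()  # no more tasks may be submitted
    pool.join()   # wait for the workers to finish
    print([r.get() for r in results])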
Example #8
def build_cache():
    """
    Get current data from the website http://www.lfd.uci.edu/~gohlke/pythonlibs/

    Returns
    -------
    Dictionary containing package details
    """

    data = {}

    soup = RoboBrowser()
    soup.open(MAIN_URL)

    # We mock out a little javascript environment within which to run Gohlke's obfuscation code
    context = js2py.EvalJs()
    context.execute("""
    top = {location: {href: ''}};
    location = {href: ''};
    function setTimeout(f, t) {
        f();
    };
    """)

    # We grab Gohlke's code and evaluate it within py2js
    context.execute(soup.find("script").text)

    links = soup.find(class_="pylibs").find_all("a")
    for link in links:
        if link.get("onclick") is not None:
            # Evaluate the obfuscation javascript, store the result (squirreled away within location.href) into url
            context.execute(link.get("onclick").split("javascript:")[-1])
            url = MAIN_URL + context.location.href

            # Details = [package, version, pyversion, --, arch]
            details = url.split("/")[-1].split("-")
            pkg = details[0].lower().replace("_", "-")

            # Not using EXEs and ZIPs
            if len(details) != 5:
                continue
            # arch = win32 / win_amd64 / any
            arch = details[4]
            arch = arch.split(".")[0]
            # ver = cpXX / pyX / pyXXx
            pkg_ver = details[1]
            py_ver = details[2]

            py_ver_key = py_ver + "-" + arch

            if pkg in data.keys():
                if py_ver_key in data[pkg].keys():
                    data[pkg][py_ver_key].update({pkg_ver: url})
                else:
                    data[pkg][py_ver_key] = {pkg_ver: url}
            else:
                data[pkg] = {py_ver_key: {pkg_ver: url}}

    return data
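
The js2py trick above (run the obfuscated onclick JavaScript in a tiny fake environment, then read the result back from Python) looks like this in isolation; the script below is made up purely for illustration:

import js2py

context = js2py.EvalJs()
# Minimal environment: the obfuscation code only ever writes to location.href
context.execute("location = {href: ''};")
# Stand-in for an onclick handler scraped from the page
context.execute("location.href = '/' + ['numpy', '1.11.1', 'cp35', 'none', 'win_amd64.whl'].join('-');")
print(context.location.href)  # -> /numpy-1.11.1-cp35-none-win_amd64.whl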
Example #9
class Question(object):
    """ Zhihu parser, question obj"""
    def __init__(self, page_url):
        self.url = page_url
        self.browser = RoboBrowser(history=True, user_agent='nemo1')
        self.browser.open(self.url)

    def get_answer_count(self):
        answer_num = self.browser.find("h3", id="zh-question-answer-num")
        if answer_num is not None:
            return int(answer_num["data-num"])

    def get_all_answer_url_list(self):
        results = []
        if self.get_answer_count() <= 10:
            for answer_div in self.browser.find_all(
                    "div", class_="zm-item-answer  zm-item-expanded"):
                results.append(URL_PREFIX + answer_div.find("link")["href"])
        else:
            for i in range(0, (self.get_answer_count() // 10) + 1):
                offset = i * 10
                if i == 0:
                    for answer_div in self.browser.find_all(
                            "div", class_="zm-item-answer  zm-item-expanded"):
                        results.append(URL_PREFIX +
                                       answer_div.find("link")["href"])
                    # print results
                else:
                    # pass
                    post_url = "http://www.zhihu.com/node/QuestionAnswerListV2"
                    _xsrf = self.browser.find("input",
                                              attrs={'name': '_xsrf'})["value"]
                    params = json.dumps({
                        "url_token":
                        int(self.url[-8:-1] + self.url[-1]),
                        "pagesize":
                        10,
                        "offset":
                        offset
                    })
                    data = {'_xsrf': _xsrf, 'method': "next", 'params': params}
                    header = {
                        'User-Agent':
                        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
                        'Host': "www.zhihu.com",
                        'Referer': self.url
                    }
                    r = requests.post(post_url,
                                      data=data,
                                      headers=header,
                                      verify=False)
                    answers = r.json()["msg"]
                    # print len(answers)
                    # pdb.set_trace()
                    for ans in answers:
                        soup = BeautifulSoup(ans, 'html.parser')
                        results.append(URL_PREFIX + soup.find("link")["href"])
        return results
Example #10
File: bscrape.py  Project: baldegg/bscrape
def scrape(q):

	query = q
	ph = re.compile(r'(\(\d{3}\) \d{3}-\d{4})')
	ad = re.compile(r'[A-Z]{2} (\d{5})')
	site = re.compile(r'(?<=\?q=).*(?=&sa)')
	result = {
	'name':'!NO DATA!',
	'address':'!NO DATA!',
	'phone':'!NO DATA!',
	'website':'!NO DATA!',
	'blurb':'!NO DATA!'
	}
	#uses mechanize to submit google search
	browser = RoboBrowser(user_agent='Firefox', parser='html.parser')
	browser.open('http://google.com/')
	
	# Search for Porcupine Tree
	form = browser.get_form(action='/search')
	form                # <RoboForm q=>
	form['q'].value = query

	browser.submit_form(form, form.submit_fields['btnG'])




	result['query']=query
	if browser.find("div", {"class" : "_B5d"}):
		result['name'] = browser.find("div", {"class" : "_B5d"}).text.encode('utf-8')
		stuff = browser.find("div", {"class" : "_uXc"})

		address = stuff.find(text=ad)
		if address:
			result['address']=address.encode('utf-8')

		phone = stuff.find(text=ph)
		if phone:
			result['phone']=phone.encode('utf-8')

		blurb = stuff.find("span")
		if blurb:
			result['blurb'] = blurb.text.encode('utf-8')

		website = stuff.find("a", string="Website")
		if website:
			website = website.get('href').encode('utf-8')
			result['website'] = site.search(website).group()


	print(result)
	delay = random.randint(5,10)
	print "Waiting " + str(delay) + " seconds..."
	time.sleep(delay)
	return result
Example #11
def ExtractAllComicImages(url):

    with open('HenRUniqueComic.txt', 'w') as f:
        print(f)

    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(url)

    read_button = browser.find('div', {'class': 'read-now'})

    link = read_button.find('a', href=True)

    ComicFirstPage = link['href']

    browser.open(ComicFirstPage)

    Select_element = browser.find('select', {'class': 'cbo_wpm_pag'})
    options = Select_element.find_all('option')

    page_numbers = (options[-1].text)

    All_Pages = CraftAllComicPages(url, page_numbers)
    '''figure out how many segments we need to split all the pages, 35 pages is one block download'''
    print(len(All_Pages))
    Segments = int(math.ceil(len(All_Pages) / 35))
    print(Segments)
    '''split the list'''
    Segment_list_container = GeneralSplitList(All_Pages, Segments)

    for unique_segment in Segment_list_container:
        jobs = []
        for page in unique_segment:
            print('Starting job for page ' + str(page))
            p = multiprocessing.Process(target=ExtractONEPAGE, args=(page, ))
            jobs.append(p)
            p.start()

        for proc in jobs:
            proc.join()

        time.sleep(2)

    print('Finished all jobs. Now returning them all as unsorted list..')

    with open('HenRUniqueComic.txt', 'r') as f:
        data = f.read().splitlines()

    print('Now sorting them and returning that..')

    sorted_result = SortHenRUniqueComic(data)

    return sorted_result
Example #12
class BKBrowser(object):
    def __init__(self):
        # Browse url :
        self.result = None
        self.browser = RoboBrowser(parser="html.parser")
        self.browser.session.headers = config.headers
        # Mount with custom SSL Adapter
        self.browser.session.mount('https://', HTTPSAdapter())

    def _connect(self):
        # Get to website
        print("- Connecting to url ...")
        self.browser.open(config.url)

    def _skip_first_page(self):
        button = self.browser.get_forms()[0]
        self.browser.submit_form(button)

    # Let's fill in the proper form !
    def _fill_form(self):
        while not self.browser.find('p', {'class': 'ValCode'}):
            inputs_map = max_radio_map(self.browser)
            f = self.browser.get_forms()[0]
            for i in f.keys():
                if f[i].value == '':
                    answers_list = inputs_map.get(i, ['1'])
                    f[i].value = random.choice(answers_list)
            f.serialize()
            self.browser.submit_form(f)

    def _fill_date_form(self):
        # Fill in Date/Time form and start the Questionnaire
        print("- Filling Forms Randomly ...")
        form = self.browser.get_forms()[0]
        form['JavaScriptEnabled'].value = '1'
        form['SurveyCode'].value = config.ID
        form['InputMonth'].value = config.date[0]
        form['InputDay'].value = config.date[1]
        form['InputHour'].value = config.time[0]
        form['InputMinute'].value = config.time[1]
        form.serialize()
        self.browser.submit_form(form)

    def get_validation_code(self):
        self._connect()
        self._skip_first_page()
        self._fill_date_form()
        self._fill_form()
        self.result = self.browser.find('p', {'class': 'ValCode'}).text
        return self.result

    def return_result(self):
        return self.result
Example #13
def gettab(keyword):
    browser = RoboBrowser(history=True, parser='html5lib')
    browser.open('https://www.tabs4acoustic.com/')

    form = browser.get_form(action=re.compile('recherche'))

    form['FindMe'].value = keyword
    browser.submit_form(form)
    div_resultat = browser.find('div', id='page_content')
    browser.follow_link(div_resultat.find('a'))
    tab = browser.find('div', id='tab_zone')
    return tab.find('pre').text
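
Usage is a single call (sketch; the search keyword is just a placeholder):

if __name__ == "__main__":
    print(gettab("wonderwall"))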
Example #14
class Downloader():
    def __init__(self, proxy=None, worker_num=0):
        self.worker_num = worker_num
        session = Session()
        if proxy is not None:
            session.proxies = {'http': proxy, 'https': proxy}
        self.browser = RoboBrowser(history=True,
                                   parser='html.parser',
                                   session=session)

    def get_download_link(self, book_url):
        self.browser.open(book_url)
        for link in self.browser.find_all("a"):
            if "download.php?t=1" in str(link):
                return f"https://www.lectulandia.cc{link['href']}"

    def download_book(self, download_url):
        self.browser.open(download_url)
        pattern = re.compile("var linkCode = \"(.*?)\";")
        section = pattern.findall(str(self.browser.parsed))
        bee_url = f'https://www.beeupload.net/file/{section[0]}'
        self.browser.open(bee_url)
        try:
            filename = self.browser.find(
                "div", id="fileDescription").find_all("p")[1].text.replace(
                    "Name: ", "")

            size = self.browser.find(
                "div", id="fileDescription").find_all("p")[2].text
            file_url = self.browser.find("a", id="downloadB")
            time.sleep(2)
            self.browser.follow_link(file_url)
            with open(f"books/{filename}", "wb") as epub_file:
                epub_file.write(self.browser.response.content)
            return filename, size
        except:
            print(self.browser.parsed)

    def get_book_page_list(self, page):
        self.browser.open(f'https://www.lectulandia.cc/book/page/{page}/')
        return [
            f"https://www.lectulandia.cc{book['href']}"
            for book in self.browser.find_all("a", class_="card-click-target")
        ]

    def download_full_page(self, page):
        print(f"Downloading page: {page} ")
        books = self.get_book_page_list(page)
        for book in books:
            time.sleep(2)
            download_url = self.get_download_link(book)
            print(f"Worker: {self.worker_num} on page: {page}",
                  self.download_book(download_url))
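
A possible driver for the class above (a sketch; it assumes a books/ directory already exists for download_book to write into):

if __name__ == "__main__":
    downloader = Downloader(proxy=None, worker_num=0)
    # Download every book listed on the first catalogue page
    downloader.download_full_page(1)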
Example #16
def get_source_code(commitId, project):
    import random
    import requests
    from robobrowser import RoboBrowser

    HEADERS_LIST = [
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
        'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
    ]

    link = []

    session = requests.Session()
    browser = RoboBrowser(session=session,
                          user_agent=random.choice(HEADERS_LIST),
                          parser="lxml")
    url = "https://github.com/" + project.replace("-",
                                                  "/") + "/commit/" + commitId

    browser.open(url + "?diff=unified")
    results = browser.find_all("a")
    for item in results:
        if ".java" in str(item):
            second_url = "https://raw.githubusercontent.com/" + project.replace(
                "-", "/") + "/" + commitId + "/" + item.string
            browser.open(second_url)
            return browser.find().text
Example #17
    def get_cookies(self):
        """ opens a fake browser to get the cookies needed """
        from robobrowser import RoboBrowser
        browser = RoboBrowser(
            user_agent=
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5',
            parser='html.parser')
        browser.open('https://battlemap.deltatgame.com/home#')
        link = browser.find('a')
        browser.follow_link(link)
        form = browser.get_form(0)

        with open('battlecreds.json') as credentialfile:
            credentials = json.load(credentialfile)
            form['Email'] = credentials['email']
            browser.submit_form(form)
            form = browser.get_form(0)
            form['Passwd'] = credentials['password']
            browser.submit_form(form)
            browser.open('https://battlemap.deltatgame.com/home')

        self.battlemap_token = browser.session.cookies.get('battlemap_session')
        self.xsrf = browser.session.cookies.get('XSRF-TOKEN')
        self.cookietimeout = time.time() + 60 * 60 * 1.95
        # GET csrf-token META HERE
        self.csrf = ''
        self.brow = browser
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(str(browser.parsed), "html.parser")
        for tag in soup.find_all('meta'):
            if 'name' in tag.attrs and tag.attrs['name'] == 'csrf-token':
                self.csrf = tag.attrs['content']
Example #18
def parseWeek(year, week):
    """
    parses a specific week on http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1
    which contains a csv of the fan duel player prices
    stores this info in fanduel_prices collection
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_fanduel_prices = db['fanduel_prices']

    if col_fanduel_prices.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5, 3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,
                          parser='html.parser',
                          user_agent=get_user_agent(logger),
                          timeout=10)
    url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(
        week, year)
    browser = open_or_follow_link(logger, browser, 'open', url)

    docs = []
    try:
        data = browser.find('pre').text
        lines = data.split('\n')
        header = lines[0]
        header = header.split(';')
        lines = lines[1:]
        for line in lines:
            doc = {}
            if not line:
                continue
            for index, each in enumerate(line.split(';')):
                doc[cleanKey(header[index])] = convertToNumber(each)
            docs.append(doc)
    except:
        logger.exception("Parse fail: %s", url)

    try:
        logger.debug('Bulk Creating docs')
        col_fanduel_prices.insert_many(docs)
    except:
        logger.exception('insert_many error')

    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
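
The heart of the parse above is turning the semicolon-separated block inside the <pre> tag into one dict per row; stripped of the cleanKey/convertToNumber helpers, that step is just (sample data invented for illustration):

sample = "Week;Year;GID;Name;Pos;Team;Salary\n1;2015;1254;Manning, Peyton;QB;den;8000\n"
lines = sample.split('\n')
header = lines[0].split(';')
docs = []
for line in lines[1:]:
    if not line:
        continue
    docs.append(dict(zip(header, line.split(';'))))
print(docs)  # [{'Week': '1', 'Year': '2015', 'GID': '1254', ...}]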
Example #19
def praca_shopping():
    from robobrowser import RoboBrowser

    browser = RoboBrowser(parser="html.parser")

    not_finded = 0
    n = 0
    names = set()
    while not_finded < 20:
        # print(f'Page {n}')
        finded = False
        url = f"http://www.pracauberabashopping.com.br/filtro_loja_tipo.asp?tipo=vlojas.asp?busca1={n}"
        browser.open(url)
        item = browser.find("strong")
        if item:
            name = item.text
            if name != "Busca sem resultado.":
                names.add(fixed(name))
                finded = True
        else:
            items = browser.find_all("a")
            if len(items) > 1:
                for item in items[1:]:
                    if item.text != "Resultado da Busca":
                        names.add(fixed(item.text))
                finded = True

        if not finded:
            not_finded += 1

        n += 1
    return names
Example #20
def Pururin():

    Front_Page_URLS = []
    Front_Page_Img = []

    base_URL = 'http://pururin.us'
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(base_URL)

    gallery = browser.find('ul', {'class': 'gallery-list'})

    all_links = gallery.find_all('a', href=True)
    all_image = gallery.find_all('img', src=True)

    count = 0
    while (count < len(all_links)):

        if 'gallery' in all_links[count]['href']:
            Front_Page_URLS.append(base_URL + all_links[count]['href'])
            Front_Page_Img.append(base_URL + all_image[count]['src'])

        count += 1

    return (Front_Page_URLS, Front_Page_Img)
def GameUpdatesForum():
    '''
    This will return as a list the latest aries updates from the main forums
    '''
    MyUpdateList = []
    base_url = 'http://elluel.net/'

    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(base_url)

    form = browser.get_form(id='navbar_loginform')

    form["vb_login_username"] = '******'
    form["vb_login_password"] = '******'

    browser.submit_form(form)

    browser.open(
        'http://elluel.net/showthread.php?15877-AriesMS-Official-Update-Fix-Log'
    )

    MyLinks = browser.find('div', {'class': 'spoiler'}).find_all('li')

    for line in MyLinks:
        MyUpdateList.append(line.text)

    return MyUpdateList
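
A sketch of calling it (the starred vb_login_* values above are placeholders and must be replaced with real forum credentials first):

if __name__ == "__main__":
    for update in GameUpdatesForum():
        print(update)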
async def AlertWhenServerUp(MyVar2):
    '''
    Runs as a background task and can be called multiple times by different people.
    It accepts the channel id and the name of the user who called it through on_message.
    It crawls the Aries homepage once a minute to check whether the server is online;
    when it is, the loop exits and the caller is told that the servers are up.
    '''
    await client.wait_until_ready()
    while True:
        ''' Keep scraping the Aries front page until the server reports online, then break and alert the person who called this function. '''
        url = 'http://aries.elluel.net/'

        browser = RoboBrowser(history=True,
                              parser='html.parser',
                              user_agent='Chrome/41.0.2228.0')
        browser.open(url)

        MyButton = browser.find('button')

        if 'ONLINE' in MyButton.text:
            break
        await asyncio.sleep(
            60)  # task runs every 60 seconds or the duration provided
    ''' Sending the alert message to the author since servers are up now'''
    await client.send_message(
        MyVar2,
        'The servers are up now. Hope I was useful to you. If you have any other useful alert ideas then DM them to me. Enjoy!'
    )
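
One way to kick the task off from an on_message handler in the pre-1.0 discord.py API this bot targets (a sketch, not taken from the original bot; the command name is made up):

@client.event
async def on_message(message):
    if message.content.startswith('!alertme'):
        # message.channel is what gets passed through as MyVar2 for client.send_message()
        client.loop.create_task(AlertWhenServerUp(message.channel))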
Example #23
def main():
    # Browse to Rap Genius
    browser = RoboBrowser(
        history=True,
        parser="html.parser")  # RoboBrowser warns if no parser is declared
    browser.open('http://rapgenius.com/')

    # Search for Queen
    form = browser.get_form(action='/search')
    form  # <RoboForm q=>
    form['q'].value = 'queen'
    browser.submit_form(form)

    # Look up the first song
    songs = browser.select('.song_name')
    try:
        browser.follow_link(songs[0])
    except IndexError:
        print("Songs Index doesn't exist!")
        return
    lyrics = browser.select('.lyrics')
    try:
        lyrics[0].text  # \n[Intro]\nIs this the real life...
    except IndexError:
        print("Lyrics Index doesn't exist!")

    # Back to results page
    browser.back()

    # Look up my favorite song
    browser.follow_link('death on two legs')

    # Can also search HTML using regex patterns
    lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
    print(lyrics.text)  # \n[Verse 1]\nYou suck my blood like a leech...
Example #24
File: FenH.py  Project: Selezar211/sdfsdf
def FenHen(keyword):

    result_urls = []

    base_URL = 'http://fenhentai.blogspot.co.uk/search?q=' + keyword
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')

    while True:
        browser.open(base_URL)

        post_body_list = browser.find_all('div',
                                          {'class': 'post-body entry-content'})

        for post in post_body_list:
            this_image = post.find('img', src=True)
            print(this_image['src'])
            result_urls.append(this_image['src'])

        Next_Post_Link = browser.find('a', {'class': 'blog-pager-older-link'},
                                      href=True)

        if Next_Post_Link is None:
            break
        else:
            base_URL = Next_Post_Link['href']

    return result_urls
Example #25
def sign_in(username, password):
    """
    Signs into the DOH website and sets the global session
    to allow other browser instances to access the cookies
    :param username: the username to login with
    :param password: the password to login with
    """
    # If already logged in, don't log in again
    global global_session
    if global_session is not None:
        return True
    # Create Non-JS browser
    browser = RoboBrowser(parser='html.parser')
    # Open login page
    browser.open('https://doh.arcabc.ca/user/login')
    # Get the login form
    form = browser.get_form(id='user-login')
    # Set the username & password
    form['name'].value = username
    form['pass'].value = password
    # Submit the form
    browser.submit_form(form)
    # If successfully signed in
    h1 = browser.find(class_='page__title')
    if h1.text == username:
        # Set the global session
        global_session = browser.session
        return True
    else:
        return False
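
Once sign_in() has populated global_session, later browser instances can reuse its cookies; a sketch (credentials and the follow-up URL are placeholders):

if sign_in("myuser", "mypassword"):
    browser = RoboBrowser(parser='html.parser', session=global_session)
    browser.open('https://doh.arcabc.ca/')  # subsequent requests now carry the login cookies
    print(browser.find('title').text)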
Example #26
def find_download_page(podcast, episode):
    download_base = 'https://www.trancepodcasts.com/download/'
    browser = RoboBrowser(history=True)
    browser.open('https://www.trancepodcasts.com/download/{:s}-{:d}/'.format(
        podcast, episode))

    link = browser.find('a', attrs={'rel': 'nofollow', 'class': 'btn'})
    browser.follow_link(link)
    browser.response
Example #27
def usingurllib():
    url = 'https://www.screener.in/'
    rb = RoboBrowser(history=True, parser="html.parser")
    rb.open(url)
    search_input = rb.find('input', attrs={'placeholder': 'Company search...'})
    f = {search_input['name']: 'PCPL'}
    post_args = urllib.parse.urlencode(f).encode('utf-8')
    fp = urllib.request.urlopen(url, post_args)
    soup = BeautifulSoup(fp, 'html.parser')
    print(soup)
Example #28
 def get_digitised_pages(self, entity_id=None):
     '''
     Returns the number of pages (images) in a digitised file.
     Note that you don't need a session id to access these pages,
     so there's no need to go through get_url().
     '''
     # url = 'http://recordsearch.naa.gov.au/scripts/Imagine.asp?B={}&I=1&SE=1'.format(entity_id)
     url = 'https://recordsearch.naa.gov.au/SearchNRetrieve/Interface/ViewImage.aspx?B={}'.format(entity_id)
     br = RoboBrowser(parser='lxml')
     br.open(url)
     try:
         pages = int(br.find('span', attrs={'id': "lblEndPage"}).string)
     except AttributeError:
         if br.find('span', attrs={'id': "lblCitation"}):
             pages = 1
         else:
             pages = 0
     return pages
Example #31
File: luscio.py  Project: Selezar211/sdfsdf
def FetchFirstImage(url):

    domain = 'https://luscious.net'
    browser = RoboBrowser(history=True,
                          parser='html.parser',
                          user_agent='Chrome/41.0.2228.0')
    browser.open(url)

    wrapper = browser.find('div', {'class': 'album_cover_item'})

    link = domain + wrapper.find('a', href=True)['href']

    browser.open(link)

    container = browser.find('div', {'class': 'ic_container'})

    img_link = container.find('img', src=True)

    return img_link['src']
Example #32
def solved_captcha(page):
    soup = BeautifulSoup(page, 'lxml')
    if soup.find('form', action='/captcha-form') is None:
        return True
    browser = RoboBrowser()
    browser.open(BASE_URL)
    form = browser.get_form(action='/captcha-form')
    captcha_url = '%s%s' % (BASE_URL, browser.find('img').get('src'))
    answer = get_captcha_answer(captcha_url)
    form['captcha[input]'].value = answer
    browser.submit_form(form)
    return False
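
A sketch of how the helper might sit in a scrape loop (it assumes requests is imported and that BASE_URL and get_captcha_answer come from the same module, as the function itself already does):

page = requests.get(BASE_URL).text
while not solved_captcha(page):
    # a captcha form was present and an answer was submitted, so fetch the page again
    page = requests.get(BASE_URL).text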
Example #33
    def test_cleanup_interface(self):
        # Init object
        browser = RoboBrowser(history=True, parser='html.parser')
        browser.open(TEST_URL)

        # Find cleanup form
        cleanup_form = browser.get_form(action='/cleanup')
        self.assertTrue(cleanup_form)

        # Cleanup DB using form
        browser.submit_form(cleanup_form)
        self.assertTrue(browser.find(text="Database cleared"))
Example #34
def browser_stuff():
    browser = RoboBrowser(parser='html.parser')
    browser.open('http://www-wfau.roe.ac.uk/sss/pixel.html')
    form = browser.get_form()
    form['coords'].value = "00 05 53.9 -34 45 08"
    form['size'].value = "15"
    form['equinox'].value = "1"
    print(form['waveband'].options)
    browser.submit_form(form)
    download_link = str(browser.find("a"))
    download_link = download_link.split(" ")[1].split("\"")[1]
    return download_link
Example #35
def scrape_revigo_csv(input_GOstats_tsv,
                      out_file,
                      pvalue_cutoff=0.05,
                      fdr_cutoff=1.0):
    """ 
    """
    oh = open(out_file, "w")

    # get input goterms from GOstats result
    goterms = GOstats2Revigo(input_GOstats_tsv,
                             pvalue_cutoff=pvalue_cutoff,
                             fdr_cutoff=fdr_cutoff,
                             output_column=3)
    if goterms:
        br = RoboBrowser(parser="lxml")
        br.open("http://revigo.irb.hr/")

        form = br.get_form()
        #print(form)
        form["goList"].value = goterms

        br.submit_form(form)

        download_rsc_link = br.find("a", href=re.compile("toR.jsp"))
        br.follow_link(download_rsc_link)
        #r_code = br.response.content.decode("utf-8")
        #print(r_code)

        br.back()

        download_csv_link = br.find("a", href=re.compile("export.jsp"))
        br.follow_link(download_csv_link)
        csv_content = br.response.content.decode("utf-8")
        oh.write(csv_content)
    else:
        oh.write(
            "term_ID,description,frequency,plot_X,plot_Y,plot_size,log10 p-value,userVal_2,uniqueness,dispensability,representative,eliminated"
        )

    oh.close()
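
Called like this (a sketch; the file names are placeholders and the GOstats2Revigo helper must be importable from the same module):

scrape_revigo_csv("gostats_results.tsv", "revigo_summary.csv",
                  pvalue_cutoff=0.05, fdr_cutoff=1.0)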
Example #36
class RoboBrowserTestCase(StaticLiveServerTestCase, base.AbstractBrowser):
    def setUp(self):
        super().setUp()
        self.browser = RoboBrowser(history=True, parser='html.parser')

    def load(self, url):
        self.browser.open(self.live_server_url + url)

    def get_title(self):
        return self.browser.find('title').text

    def get_form(self, selector):
        return RoboBrowserForm(self.browser, selector)
def get_webdav_urls(username, password):

    # log in

    browser = RoboBrowser(history=True)
    browser.open('http://ctools.umich.edu')
    browser.follow_link(browser.find(id='ctoolsLogin'))

    login_form = browser.get_form()
    login_form['login'].value = username
    login_form['password'].value = password
    browser.submit_form(login_form)

    # get the results

    browser.follow_link(browser.find(
        class_='toolMenuLink ',
        title='For creating, revising, and deleting course and project sites'
    ))
    browser.open(browser.find(class_='portletMainIframe').attrs['src'])

    results = []

    course_links = browser.select('#sitesForm td h4 a[target="_top"]')
    for course_link in course_links:

        if not course_link.attrs:
            continue
        href = course_link.attrs['href']
        if '~' in href:
            continue

        results.append(
            'https://ctools.umich.edu/dav' +
            findall(r'/[^/]+$', href)[0]
        )

    return results
Example #38
class FakeMail(object):
    def __init__(self):
        self.browser = RoboBrowser(history=True)
        self.browser.open('http://10minutemail.com/')
        with open('10minmail.txt', 'w') as f:
            f.write(str(self.browser.parsed))
        if self.browser.get_link('Blocked'):
            raise BlockedException('to many login Attempts')


    def get_address(self):
        address = self.browser.find("div", {"id": "copyAddress"})
        print(address)

    def read_mail(self):
        pass
Example #39
File: pipwin.py  Project: pbrod/pipwin
def build_cache():
    """
    Get current data from the website http://www.lfd.uci.edu/~gohlke/pythonlibs/

    Returns
    -------
    Dictionary containing package details
    """

    data = {}

    soup = RoboBrowser()
    soup.open(MAIN_URL)
    links = soup.find(class_="pylibs").find_all("a")
    for link in links:
        if link.get("onclick") is not None:
            jsfun = link.get("onclick").split('"')
            mlstr = jsfun[0].split("(")[1].strip()[1:-2]
            ml = list(map(int, mlstr.split(",")))
            mi = jsfun[1]
            url = parse_url(ml, mi)

            # Details = [package, version, pyversion, --, arch]
            details = url.split("/")[-1].split("-")
            pkg = details[0].lower().replace("_", "-")

            # Not using EXEs and ZIPs
            if len(details) != 5:
                continue
            # arch = win32 / win_amd64 / any
            arch = details[4]
            arch = arch.split(".")[0]
            # ver = cpXX / pyX / pyXXx
            pkg_ver = details[1]
            py_ver = details[2]

            py_ver_key = py_ver + "-" + arch

            if pkg in data.keys():
                if py_ver_key in data[pkg].keys():
                    data[pkg][py_ver_key].update({pkg_ver: url})
                else:
                    data[pkg][py_ver_key] = {pkg_ver: url}
            else:
                data[pkg] = {py_ver_key: {pkg_ver: url}}

    return data
Example #40
class Tracker(object):
    def __init__(self):
        self.browser = RoboBrowser(history=True, parser='html.parser')
        self.login()

    def login(self):
        self.browser.open(TRACKER_LOGIN)
        login_form = self.browser.get_form(id='new_member')

        login_form["member[login]"] = settings.TRACKER_LOGIN
        login_form["member[password]"] = settings.TRACKER_PASSWORD
        self.browser.session.headers['Referer'] = TRACKER
        self.browser.submit_form(login_form)

    def create_event(self, event, description):
        try:
            self.browser.open(TRACKER_EVENT)

            event_form = self.browser.get_form(id='new_event')

            event_form["event[title]"] = event["event_name"]
            event_form["event[contact_name]"] = event["contact"]
            event_form["event[contactemail]"] = event["email"]
            event_form[EVENT_DATE + "[description]"] = "Show"
            event_form[EVENT_DATE + "[location_ids][]"] = "70"

            start_date = event["start_date"]

            event_form[EVENT_DATE + "[startdate(1i)]"] = str(start_date.year)
            event_form[EVENT_DATE + "[startdate(2i)]"] = str(start_date.month)
            event_form[EVENT_DATE + "[startdate(3i)]"] = str(start_date.day)
            event_form[EVENT_DATE + "[startdate(5i)]"] = "00"
            event_form[EVENT_DATE + "[enddate(1i)]"] = str(start_date.year)
            event_form[EVENT_DATE + "[enddate(2i)]"] = str(start_date.month)
            event_form[EVENT_DATE + "[enddate(3i)]"] = str(start_date.day)
            event_form[EVENT_DATE + "[enddate(5i)]"] = "05"
            event_form["event[notes]"] = description

            self.browser.submit_form(event_form)

            if "errors prohibited" in str(self.browser.parsed):
                return self.browser.find(id="errorExplanation")
            else:
                return self.browser.url
        except Exception as e:
            return "EXCEPTION! " + str(e)
Example #41
def get_email_by_cin(cin):
    url = 'http://www.mca.gov.in/mcafoportal/viewCompanyMasterData.do'
    browser = RoboBrowser()
    browser.session.headers['User-Agent'] = random.choice(user_agents)
    browser.open(url)
    form = browser.get_forms()[-1]
    form['companyID'].value = cin
    browser.submit_form(form)
    table = browser.find('table', attrs={'class': 'result-forms'})
    if not table:
        return None
    email_header = table.find('td', text='Email Id')
    if not email_header:
        return None
    email_row = email_header.findNext('td')
    email = str.strip(email_row.text)
    return email.lower()
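
Usage sketch (the CIN below is a made-up placeholder, not a real company identifier):

email = get_email_by_cin("U00000XX0000XXX000000")
if email:
    print(email)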
def main():
    args = docopt(__doc__, version="dailyprogrammer-dl v{}".format(__version__))

    # Configure logging
    logLevel = logging.INFO #default
    if args['--verbose']:
        logLevel = logging.DEBUG
    elif args['--quiet']:
        logLevel = logging.ERROR

    logging.basicConfig(format='%(levelname)s: %(message)s', level=logLevel)
    logging.debug(args)

    # Process command line arguments
    challengeURL = args['<challengeurl>']

    # Parse project page for title and description
    logging.info("Parsing daily challenge: {}".format(challengeURL))
    browser = RoboBrowser()
    browser.session.headers['User-Agent'] = "dailyprogrammer-dl v{} by /u/zod77".format(__version__)
    browser.open(challengeURL)
    title = browser.find('a',class_='title').string
    description = browser.find_all('div',class_="md")
    description = description[1]
    descriptionHTML = "".join(str(t) for t in description.contents) # remove outer <div>

    projectName = generateProjectName(title)

    # Init project skeleton
    logging.info("Generating project")
    projectPath = os.path.abspath(projectName)
    os.mkdir(projectPath)

    # Write out project files
    pyTemplate = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"boilerplate.txt"))
    shutil.copy(pyTemplate, os.path.join(projectPath,"{}.py".format(projectName)))

    # Generate README.md
    h = html2text.HTML2Text()
    descriptionMD = h.handle(descriptionHTML)
    readme = os.path.join(projectPath,"README.md")
    with open(readme, "w") as f:
        f.write(descriptionMD)

    return
Example #43
def testProxy(proxy):
    """
    Tests a proxy with api.ipify.org
    If the proxy fails, it retries 20 more times.
    This is because free proxies are unreliable at times.
    """
    tries = 0
    browser = RoboBrowser(history=False, parser='html5lib', timeout=10)
    while True:
        try:
            tries += 1
            browser.open("http://api.ipify.org", proxies={'http': proxy})
            # ipify echoes the caller's public IP; it should match the proxy's address
            if browser.find('body').text != proxy.split(':')[0]:
                raise Exception('Failed')
            return proxy
        except Exception:
            if tries > 20:
                return None
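
A sketch of filtering a candidate list down to working proxies with it (the addresses use documentation IP ranges and are placeholders):

candidates = ["203.0.113.10:8080", "198.51.100.7:3128"]
working = [p for p in candidates if testProxy(p)]
print(working)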
Example #44
def gatherData(user, password):
    baseURL = 'https://sigarra.up.pt/feup/pt/'
    browser = RoboBrowser(history=True, parser='html.parser')
    browser.open(baseURL + 'web_page.Inicial')

    # Gets the login form
    form = browser.get_form(action=re.compile(r'validacao'))

    # Updates the login form with the user credentials
    form['p_user'].value = 'up' + user
    form['p_pass'].value = password

    browser.submit_form(form)

    # Goes to the user profile
    browser.open(baseURL + 'fest_geral.cursos_list?pv_num_unico=' + user)

    # Opens the extended view
    extended = browser.find(title='Visualizar informações no contexto do curso')
    browser.follow_link(extended)

    credits = []
    grades = []

    # For each html class containing grades ("i", "p" and "o"), gather data
    for i in browser.find_all(class_='i'):
        if i.find(class_='n aprovado'):
            credits.append(i.find(class_='k n').text)
            grades.append(i.find(class_='n aprovado').text)

    for j in browser.find_all(class_='p'):
        if j.find(class_='n aprovado'):
            credits.append(j.find(class_='k n').text)
            grades.append(j.find(class_='n aprovado').text)

    for k in browser.find_all(class_='o'):
        if k.find(class_='n aprovado'):
            credits.append(k.find(class_='k n').text)
            grades.append(k.find(class_='n aprovado').text)

    return credits, grades
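
The returned credit/grade strings can be folded into a credit-weighted average, e.g. (a sketch; the credentials are placeholders and the strings are assumed to parse as numbers once ',' is normalised to '.'):

credits, grades = gatherData("201800000", "password")
pairs = [(float(c.replace(',', '.')), float(g.replace(',', '.')))
         for c, g in zip(credits, grades)]
average = sum(c * g for c, g in pairs) / sum(c for c, _ in pairs)
print("Weighted average: %.2f" % average)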
Example #45
def parseTeam(team_url, team_name):
    """
    parses a teams page returns a list of year urls
    there is some data on this page that would be usefull to scrape in the future
    """
    logger = makeLogger(cleanKey(team_name), r"./logs_pfrTeamStats/")

    startTime = datetime.now()

    logger.debug("Starting %s", team_name)

    wait = random.uniform(1.5, 3.5)
    logger.debug("Waiting %f", wait)
    time.sleep(wait)

    logger.debug("Opening main page")
    browser = RoboBrowser(history=False, parser="html5lib", user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, "open", team_url)
    table = browser.find(id="team_index").find("tbody")
    year_columns = table.find_all("th")

    year_url_tups = []
    for index, year_column in enumerate(year_columns):
        logger.debug("Row %d of %d", index, len(year_columns))
        try:
            year_link = year_column.find("a")
            if year_link:
                year_url = "http://www.pro-football-reference.com" + year_link["href"]
                year = convertToNumber(year_link.text)
                if not isinstance(year, int):
                    continue
                year_url_tups.append((team_name, year_url, year))
        except:
            logger.exception(year_column)

    logger.debug("parseTeam time elapsed: " + str(datetime.now() - startTime))

    closeLogger(logger)

    return year_url_tups
Example #46
def run(wait):
    """
    """
    logger = makeLogger('main', r'./logs_pfrPlayerStats/')

    startTime = datetime.now()
    
    logger.debug('start time: ' + str(startTime))
     
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    
    player_tuples = []
    for letter in list(string.ascii_uppercase):
        wait = random.uniform(.5,1.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Opening players %s', letter)
        browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/players/{}/".format(letter))
        players = browser.find(id="div_players")

        for player in players.find_all('p'):
            player = player.find('a')
            player_tuples.append((player.text, player['href']))

    pool = Pool(processes=int(get_proxy_count()/2.5))

    logger.debug('Processing %d Players', len(player_tuples))
    for player_tuple in player_tuples:
        #parsePlayer(player_tuple[0], player_tuple[1])
        pool.apply_async(parsePlayer, (player_tuple[0], player_tuple[1],))


    pool.close() #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.
    pool.join() #Wait for the worker processes to exit. One must call close() or terminate() before using join().

    logger.debug('run time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(logger)
Example #47
File: captmb.py  Project: TeamBeaver/webar
def scrap(url):

        browser = RoboBrowser(user_agent='i am tool')
        browser.open(url)
        a = browser.find(class_='captcha') ##machine learning would be great for class prediction
        fullsrc = url[:-1] + a['src']
        request.urlretrieve(fullsrc, "captcha.jpg")
        ## tesseract OCR will go here


        ## right here
        form = browser.get_form(action=re.compile(r'.'))

        # Fill it out
        form['name'].value = 'namaaaeee'

        form['password'].value = '*****@*****.**'
        form['password2'].value = 'teambeaver'
        form['captcha_1'].value = '1234'

        # Submit the form
        browser.submit_form(form)

        print(browser.response)
Example #48
def connect(request, mmg):
    """
    Login to MyMedicare.gov using RoboBrowser
    :param request: Django request object (used for messages)
    :param mmg: dict carrying mmg_user / mmg_pwd; status, url and the browser are added to it
    :return: the updated mmg dict

    """
    mmg_back = mmg
    mmg_back['status'] = "FAIL"
    PARSER = BS_PARSER
    if not PARSER:
        logger.debug('Default Parser for BeautifulSoup: %s', 'lxml')
        PARSER = 'lxml'

    login_url = 'https://www.mymedicare.gov/default.aspx'

    # This is for testing. Next step is to receive as parameters
    username = mmg['mmg_user']  # 'MBPUSER202A'
    # password = '******'# 'CMSPWD2USE'
    password = mmg['mmg_pwd']  # 'CMSPWD2USE'

    # Call the default page
    # We will then want to get the Viewstate and eventvalidation entries
    # we need to submit them with the form
    rb = RoboBrowser()
    mmg_back['robobrowser'] = rb

    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    # Open the form to start the login
    rb.open(login_url)

    # Get the form content
    form = rb.get_form()

    # if settings.DEBUG:
    #    print("Page:", rb)

    # We will be working with these form fields.
    # Set them as variables for easier re-use
    form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
    form_usr = "******"
    form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
    # sign_in = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SignIn"
    # EVENTTARGET = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SignIn"
    form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                      "CreateAccount"

    # Set the form field values
    form.fields[form_usr].value = username
    form.fields[form_pwd].value = password

    # There is a javascript popup after hitting submit
    # It seems to set the following field to "True"
    # Default in form is "False"
    form.fields[form_agree].value = "True"

    # Remove the CreateAccount field. It seems to drive the form
    # to the registration page.
    form.fields.pop(form_create_acc)

    # Capture the dynamic elements from these damned aspnetForms
    # We need to feed them back to allow the form to validate
    VIEWSTATEGENERATOR = form.fields['__VIEWSTATEGENERATOR']._value
    EVENTVALIDATION = form.fields['__EVENTVALIDATION']._value
    VIEWSTATE = form.fields['__VIEWSTATE']._value

    # if settings.DEBUG:
    #     print("EventValidation:", EVENTVALIDATION )
    #     print("ViewStateGenerator:", VIEWSTATEGENERATOR)

    # Set the validator fields back in to the form
    form.fields['__VIEWSTATEGENERATOR'].value = VIEWSTATEGENERATOR
    form.fields['__VIEWSTATE'].value = VIEWSTATE
    form.fields['__EVENTVALIDATION'].value = EVENTVALIDATION

    # Prepare the form for submission
    form.serialize()

    # logger.debug("serialized form:", form)

    # submit the form
    rb.submit_form(form)

    # logger.debug("RB:", rb, "\nRB:", rb.__str__())

    browser = RoboBrowser(history=True)
    if browser:
        pass
    # browser.parser = PARSER

    # logger.debug("Browser History:", browser.history,
    #              "\nBrowser parser:", browser.parser,
    #              # "\nPage html:", rb.parsed
    #              )

    if not rb.url == "https://www.mymedicare.gov/dashboard.aspx":
        err_msg = rb.find("span",
                          {"id": "ctl00_ContentPlaceHolder1"
                                 "_ctl00_HomePage_lblError"})
        if err_msg:
            err_msg = err_msg.contents
            messages.error(request, err_msg)
        messages.error(request, "We had a problem connecting to your"
                                "Medicare account")
        mmg_back['status'] = "FAIL"
        mmg_back['url'] = rb.url
        return mmg_back

    # <ul id="headertoolbarright">
    #    <li class="welcometxt" id="welcomeli">Welcome, JOHN A DOE </li>
    my_name = rb.find("li", {"id": "welcomeli"})
    if my_name:
        my_name = my_name.contents[0].replace("Welcome, ", "")
    my_account = rb.find("div", {"id": "RightContent"})
    if my_account:
        my_account = my_account.prettify()
        my_account = my_account.replace('href="/',
                                        'target="_blank" '
                                        'href="https://www.mymedicare.gov/')
        # my_account = my_account.contents
    # href="/mymessages.aspx"
    # href="/myaccount.aspx"
    # href="/plansandcoverage.aspx"
    # my_account.str('href="/mymessages.aspx',
    #                'href="https://www.mymedicare.gov/mymessages.apsx')
    # my_account.str('href="/myaccount.aspx',
    #                'href="https://www.mymedicare.gov/myaccount.aspx')
    # my_account.str('href="/plansandcoverage.aspx',
    #                'href="https://www.mymedicare.gov/plansandcoverage.aspx')

    # if settings.DEBUG:
    #     print("\nMyAccount:", len(my_account), "|", my_account)

    # Need to pass data to context and then render to different
    # template with some data retrieved from MyMedicare.gov
    # If successfully logged in, Or return an error message.
    mmg_back['status'] = "OK"
    mmg_back['url'] = rb.url
    mmg_back['mmg_account'] = my_account
    mmg_back['mmg_name'] = my_name

    mmg_back['robobrowser'] = rb

    # logger.debug("RB post sign-in:", rb,
    #              "rb url:", rb.url)

    return mmg_back
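
# A hedged usage sketch for connect() above: build the mmg dict with the keys
# connect() reads and inspect the keys it writes back. The helper name and the
# credential arguments are placeholders, not part of the original module.
def login_to_medicare(request, username, password):
    mmg = {'mmg_user': username, 'mmg_pwd': password}
    result = connect(request, mmg)
    # On success the dict also carries 'mmg_name', 'mmg_account', the final
    # URL and the live RoboBrowser session under 'robobrowser'.
    return result['status'] == "OK", result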
Example #49
File: mti.py Project: jmcarp/nih-trends
class Submitter:

    def __init__(self):
        self.browser = RoboBrowser(parser='html5lib')

    def login(self):
        self.browser.open(MTI_BATCH_URL)
        form = self.browser.get_form('fm1')
        form['username'] = config.MTI_USERNAME
        form['password'] = config.MTI_PASSWORD
        self.browser.submit_form(form)

    def submit(self, batch_id):
        logger.info('Submitting MTI batch {:04}'.format(batch_id))
        batch = session.query(MtiBatch).filter_by(id=batch_id).one()
        path = get_batch_file(batch_id, 'abstracts')
        self.browser.open(MTI_BATCH_URL)

        form = self.browser.get_form()
        form['Email_Address'] = config.MTI_EMAIL
        form['BatchNotes'] = config.MTI_EMAIL
        form['UpLoad_File'] = open(path)
        form['Filtering'] = ''
        form['SingLinePMID'] = 'Yes'
        form['Output'] = 'detail'
        self.browser.submit_form(form)

        # Confirm submit
        js = self.browser.find('script').text
        param = MTI_SCHEDULE_RE.search(js).groups()[0]
        self.browser.open(MTI_CONFIRM_URL + param)

        batch.submitted = True
        batch.path = param
        session.commit()

    def fetch(self, path, batch_id):
        session = self.browser.session
        path = '{}/text.out'.format(path.replace(MTI_PATH_PREFIX, ''))
        url = urljoin(MTI_BASE_URL, path)
        resp = session.get(url, stream=True)
        resp.raise_for_status()
        with open(get_batch_file(batch_id, 'terms'), 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                fp.write(chunk)

    @classmethod
    def batch_submit(cls):
        rows = session.query(
            MtiBatch
        ).filter_by(
            submitted=False,
            done=False,
        ).limit(
            MAX_SUBMIT
        )
        if rows:
            submitter = cls()
            submitter.login()
            for batch in rows:
                submitter.submit(batch.id)

    @classmethod
    def batch_fetch(cls):
        submitter = cls()
        submitter.login()
        rows = session.query(
            MtiBatch
        ).filter(
            MtiBatch.submitted == True,  # noqa
            MtiBatch.done == False,  # noqa
            MtiBatch.path != None,  # noqa
        )
        for row in rows:
            try:
                submitter.fetch(row.path, row.id)
                load_batch(row.id)
            except HTTPError as exc:
                logger.exception(exc)
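
# A minimal driver sketch for the Submitter class above, assuming the
# config.MTI_* settings and the SQLAlchemy session/MtiBatch model are wired up
# as in the original project; the __main__ guard itself is an assumption.
if __name__ == '__main__':
    Submitter.batch_submit()
    Submitter.batch_fetch()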
Example #50

    def authenticate(self, username=None, password=None):
        login_url = 'https://www.mymedicare.gov/default.aspx'
        rb = RoboBrowser()
        rb.parser = 'lxml'
        rb.open(login_url)
        # Get the form content
        form = rb.get_form()
        if settings.DEBUG:
            print("Page:", rb)
        # We will be working with these form fields.
        # Set them as variables for easier re-use
        form_pwd = "ctl00$ContentPlaceHolder1$ctl00$HomePage$SWEPassword"
        form_usr = "******"
        form_agree = "ctl00$ContentPlaceHolder1$ctl00$HomePage$Agree"
        form_create_acc = "ctl00$ContentPlaceHolder1$ctl00$HomePage$lnk" \
                          "CreateAccount"
        # Set the form field values
        form.fields[form_usr].value = username
        form.fields[form_pwd].value = password
        # There is a javascript popup after hitting submit
        # that sets form_agree to "True"
        # Default in form is "False"
        form.fields[form_agree].value = "True"
        # Remove the CreateAccount field. It seems to drive the form
        # to the registration page.
        form.fields.pop(form_create_acc)
        # Capture the dynamic elements from these damned aspnetForms
        # We need to feed them back to allow the form to validate
        VIEWSTATEGENERATOR = form.fields['__VIEWSTATEGENERATOR']._value
        EVENTVALIDATION = form.fields['__EVENTVALIDATION']._value
        VIEWSTATE = form.fields['__VIEWSTATE']._value
        # Set the validator fields back in to the form
        form.fields['__VIEWSTATEGENERATOR'].value = VIEWSTATEGENERATOR
        form.fields['__VIEWSTATE'].value = VIEWSTATE
        form.fields['__EVENTVALIDATION'].value = EVENTVALIDATION
        # Prepare the form for submission
        form.serialize()
        # submit the form
        rb.submit_form(form)
        # If the login was successful then we would be redirected to the dashboard.
        if rb.url == "https://www.mymedicare.gov/dashboard.aspx":
            """The login worked."""
            # Get the name
            my_name = rb.find("li", {"id": "welcomeli"})
            if my_name:
                my_name = my_name.contents[0].replace("Welcome, ", "")

            split_name = my_name.split(' ')
            first_name = split_name[0]
            last_name = split_name[-1]
            if not last_name:
                last_name = split_name[-2]

            try:
                user = User.objects.get(username=username)
            except User.DoesNotExist:
                # Create a new user. Note that we can set password
                # to anything, because it won't be checked; the password
                # from the external backend is checked (coming from settings).
                user = User(username=username, password='******',
                            first_name=first_name,
                            last_name=last_name)
                user.save()
                up, created = UserProfile.objects.get_or_create(user=user,
                                                                user_type='BEN')
                group = Group.objects.get(name='BlueButton')
                user.groups.add(group)

            return user
        # The MyMedicare login failed.
        return None
Example #51
def parseYear(year):
    """
    parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm
    follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info)
    stores schedule info in nfl_data.schedule
    stores game_info in nfl_data.game_info with schedule ids
    """
    logger = makeLogger(year, r'./logs_pfrSchedule/')

    startTime = datetime.now()

    logger.debug('Starting %d', year)

    schedule_list = []
    gameInfo_list = []

    client = MongoClient('localhost', 27017)
    db = client['nfl_data']
    col_schedule = db['schedule']
    col_game_info = db['game_info']
    col_failed_game_info = db['failed_game_info']

    if col_schedule.find({'year': year}).count():
        logger.debug('Already parsed %s', year)
        closeLogger(logger)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://www.pro-football-reference.com/years/{}/games.htm".format(year))
    table = browser.find(id='games')
    rows = table.find_all('tr')
    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        try:
            schedule_dict = {}
            gameInfo_dict = {}
            columns = row.find_all('td')
            if columns:
                schedule_dict['week'] = convertToNumber(columns[0].text)
                schedule_dict['day'] = columns[1].text
                schedule_dict['date'] = columns[2].text
                schedule_dict['year'] = convertToNumber(year)
                homeIndicator = columns[5].text
                if homeIndicator == '@':
                    schedule_dict['homeTeam'] = columns[6].text
                    schedule_dict['awayTeam'] = columns[4].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[8].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[7].text)
                else:
                    schedule_dict['homeTeam'] = columns[4].text
                    schedule_dict['awayTeam'] = columns[6].text
                    schedule_dict['homeTeamScore'] = convertToNumber(columns[7].text)
                    schedule_dict['awayTeamScore'] = convertToNumber(columns[8].text)
                gameInfo_dict['week'] = convertToNumber(columns[0].text)
                gameInfo_dict['year'] = convertToNumber(year)
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                url = columns[3].find('a')
                if url:
                    url = 'http://www.pro-football-reference.com' + url['href']
                    failed_game_info = True
                    browser = open_or_follow_link(logger, browser, 'open', url)
                    game_info = browser.find(id="game_info")
                    if game_info:
                        for each in game_info.find_all('tr'):
                            pair = each.find_all('td')
                            if pair:
                                failed_game_info = False
                                key = pair[0].text
                                value = convertToNumber(pair[1].text)
                                gameInfo_dict[cleanKey(key)] = convertToNumber(value)
                    if failed_game_info:
                        failed_dict = schedule_dict
                        failed_dict['row'] = index
                        failed_dict['href'] = url['href']
                        col_failed_game_info.insert(failed_dict)
                        gameInfo_dict['FAIL'] = True

                schedule_list.append(schedule_dict)
                gameInfo_list.append(gameInfo_dict)
        except:
            logger.exception(row)

    logger.debug('nfl_schedule.insert_many')

    schedule_ids = col_schedule.insert_many(schedule_list).inserted_ids
    
    logger.debug('mapping nfl_schedule.id to gameInfo_list')

    for index, schedule_id in enumerate(schedule_ids):
        if len(gameInfo_list[index].keys()) <= 2:
            logger.debug('Empty game_info: %s', schedule_id)
        gameInfo_list[index]['schedule_id'] = schedule_id

    logger.debug('game_info.insert_many')
    col_game_info.insert_many(gameInfo_list)

    logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(year)
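
# A hedged driver sketch for parseYear() above, assuming a MongoDB instance at
# localhost:27017 as used inside the function; the year range is a placeholder.
if __name__ == '__main__':
    for year in range(2010, 2016):
        parseYear(year)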
Example #52
# coding: utf-8
import re
from robobrowser import RoboBrowser

url = "http://www.qq.com/"
b = RoboBrowser(history=True)
b.open(url)

# Grab the "today's topic" link
today_top = b.find(id="todaytop").a
print today_top["href"]

b.follow_link(today_top)

# At this point we have already navigated to the today's topic detail page

# Print the title
title = b.select(".hd h1")[0]
print "*************************************"
print title.text
print "*************************************"

# Print the article body
print b.find(id="articleContent").text
Example #53
import re
from robobrowser import RoboBrowser

url = 'http://itest.info/courses/2'
b = RoboBrowser(history=True)
b.open(url)

class_name = b.select('.headline h2')
print class_name[0].text

class_desc = b.select('.tag-box')
print class_desc[0].text

class_time = b.select('h4')
print class_time[0].text

teacher = b.select('.thumbnail-style h3')
print teacher[0].text

qq = b.find(text=re.compile('QQ'))
print qq

qq_group = b.find(text=re.compile('\+selenium'))
print qq_group

Example #54

import re

from requests import Session as req_session
from robobrowser import RoboBrowser

# USERNAME and PSWD are expected to be defined elsewhere (e.g. a config module).
USERAGENTS = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0'
session = req_session()
session.headers.update({'Referer': 'https://www.deviantart.com'})

browser = RoboBrowser(history=False, session=session, tries=2, user_agent=USERAGENTS)

print("Attempting to log in to deviantArt...")

browser.open('https://www.deviantart.com/users/login?ref=https%3A%2F%2Fwww.deviantart.com%2F&remember_me=1')
form = browser.get_forms()[1]
form['username'] = USERNAME
form['password'] = PSWD
#print(form)
if browser.find(text=re.compile("Login")):
    print('Compiled login fields form...')

browser.submit_form(form)

if browser.find(text=re.compile("The password you entered was incorrect")):
        print("Wrong password or username. Attempting to download anyway.")
        exit();
elif browser.find(text=re.compile("\"loggedIn\":true")):
        print("Logged in!")
else:
        print("Login unsuccessful. Attempting to download anyway.")
        exit();
browser.open('https://www.deviantart.com/messages/#view=deviantwatch')
page=browser.select('body')
Example #55
def parseSeason(role, category, season, seasonTypes):
    """Parses every seasonType in a season at http://www.nfl.com/stats/categorystats for a given role/category/season
    doesn't follow any links
    some years don't have any info, but still return a page.
    These are logged with Exception('No teams found %s' % url)
    All data is stored in team_stats
    """
    logger = makeLogger(role.text + '_' + category.text + '_' + season.text, r'./logs_nflteamStat/')

    startTime = datetime.now()

    logger.debug('Starting %s %s %s', role.text, category.text, season.text)

    teamStat_list = []
    for seasonType in seasonTypes:
        if seasonType.text == "Season Type...":
            continue

        team_stats_query = {'year': convertToNumber(removeNewLine(season.text)),
            'seasonType': removeNewLine(seasonType.text),
            'role': removeNewLine(role.text),
            'category': removeNewLine(category.text)
        }

        if col_team_stats.find(team_stats_query).count():
            logger.debug('Already parsed %s', team_stats_query)
            continue

        wait = random.uniform(1.5,3.5)
        logger.debug('Waiting %f', wait)
        time.sleep(wait)

        logger.debug('Starting: %s', team_stats_query)
        url = 'http://www.nfl.com/stats/categorystats?' + 'archive=true&conference=null' + '&role=' + role['value']
        try:
            if role.text == "Offense":
                categoryUrl = '&offensiveStatisticCategory=' + category['value'] + '&defensiveStatisticCategory=null'
                
            elif role.text == "Defense":
                categoryUrl = '&offensiveStatisticCategory=null&defensiveStatisticCategory=' + category['value']
            else:
                raise Exception('Unsupported role: %s', role.text)
            
            url += categoryUrl
            url += '&season=' + season['value'] + '&seasonType=' + seasonType['value'] + '&tabSeq=2&qualified=false&Submit=Go'

            logger.debug('Opening: %s', url)
            browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
            browser = open_or_follow_link(logger, browser, 'open', url)
            result = browser.find(id="result")

            tries = 0
            # sometimes when using slow proxies nfl.com returns 200 without the whole page being loaded
            while not result:
                if tries > 10:
                    raise Exception('No teams found %s' % url)
                elif tries > 0:
                    time.sleep(random.uniform(5, 7))
                tries += 1
                logger.debug('No result-tries: %d', tries)
                browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
                browser = open_or_follow_link(logger, browser, 'open', url)
                result = browser.find(id="result")

            tbodies = result.find_all("tbody")
            if len(tbodies) != 2:
                raise Exception("error parsing result")
            tableKey = tbodies[0]
            tableKeyRows = tableKey.find_all("tr")
            topTableKeys = []
            if len(tableKeyRows) == 1:
                tableKey = tableKey.find_all("th")
            elif len(tableKeyRows) == 2:
                topTableColumns = tableKeyRows[0].find_all("th")
                for topTableColumn in topTableColumns:
                    for _ in range(int(topTableColumn['colspan'])):
                        topTableKeys.append(topTableColumn.text)
                tableKey = tableKeyRows[1].find_all("th")
            else:
                raise Exception('Too many header rows found')

            tableItems = tbodies[1]
            tableItems = tableItems.find_all("td")

            tableColumn = 0
            teamStatDict = {}
            for tableIndex, tableItem in enumerate(tableItems):
                if tableColumn == 0:
                    logger.debug('Row %d of %d', tableIndex, len(tableItems))
                    tableColumn += 1
                    continue

                if tableColumn == 1:
                    teamStatDict['team'] = removeNewLine(tableItem.text)
                    teamStatDict['year'] = int(removeNewLine(season.text))
                    teamStatDict['seasonType'] = removeNewLine(seasonType.text)
                    teamStatDict['role'] = removeNewLine(role.text)
                    teamStatDict['category'] = removeNewLine(category.text)
                    tableColumn += 1
                    continue

                if topTableKeys and topTableKeys[tableColumn]:
                    key = topTableKeys[tableColumn] + '_' + tableKey[tableColumn].text
                else:
                    key = tableKey[tableColumn].text
                key = cleanKey(removeNewLine(key))
                value = convertToNumber(removeNewLine(tableItem.text))
                teamStatDict[key] = value

                tableColumn += 1
                if tableColumn >= len(tableKey):
                    teamStat_list.append(teamStatDict)
                    teamStatDict = {}
                    tableColumn = 0
        except:
            logger.exception('row fail')

    try:
        if teamStat_list:
            logger.debug('Bulk Creating teamStat_list')
            col_team_stats.insert_many(teamStat_list)
    except:
        logger.exception('insert_many error')

    logger.debug('parseSeason time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(role.text + '_' + category.text)
Example #56
class RSClient():

    def __init__(self):
        self._create_browser()

    def _create_browser(self):
        url = 'http://recordsearch.naa.gov.au/scripts/Logon.asp?N=guest'
        self.br = RoboBrowser(parser='lxml', user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36')
        self.br.open(url)
        form = self.br.get_form(id='t')
        self.br.submit_form(form)

    def _open_url(self, url):
        '''
        RecordSearch inserts a page that needs to have an embedded form
        automatically submitted before you get what you actually want.
        '''
        self.br.open(url)
        form = self.br.get_form(id='t')
        self.br.submit_form(form)

    def _get_details(self, entity_id):
        '''
        Given an id retrieve the element containing the item details.
        '''
        if (not entity_id and self.entity_id) or (entity_id == self.entity_id):
            details = self.details
        else:
            url = '{}{}'.format(RS_URLS[self.entity_type], quote_plus(entity_id))
            self._open_url(url)
            details = self.br.find('div', 'detailsTable')
            if details:
                self.entity_id = entity_id
                self.details = details
            else:
                raise UsageError('No details found for {}'.format(entity_id))
        return details

    def _get_cell(self, label, entity_id):
        details = self._get_details(entity_id)
        try:
            cell = (
                details.find(text=re.compile(label))
                .parent.parent.findNextSiblings('td')[0]
            )
        except (IndexError, AttributeError):
            # Sometimes the cell labels are inside an enclosing div,
            # but sometimes not. Try again assuming no div.
            try:
                cell = (
                    details.find(text=re.compile(label))
                    .parent.findNextSiblings('td')[0]
                )
            except (IndexError, AttributeError):
                cell = None
        return cell

    def _get_value(self, label, entity_id):
        cell = self._get_cell(label, entity_id)
        try:
            value = ' '.join([string for string in cell.stripped_strings])
        except AttributeError:
            value = None
        return value

    def _get_formatted_dates(self, label, entity_id, date_format):
        try:
            date_str = self._get_value(label, entity_id)
        except AttributeError:
            formatted_dates = {'date_str': None, 'start_date': None, 'end_date': None}
        else:
            if date_str:
                dates = utilities.process_date_string(date_str)
                if date_format == 'iso':
                    formatted_dates = {
                        'date_str': date_str,
                        'start_date': utilities.convert_date_to_iso(dates['start_date']),
                        'end_date': utilities.convert_date_to_iso(dates['end_date']),
                    }
                elif date_format == 'obj':
                    formatted_dates = dates
            else:
                formatted_dates = {'date_str': None, 'start_date': None, 'end_date': None}
        return formatted_dates

    def _get_relations(self, label, entity_id, date_format):
        cell = self._get_cell(label, entity_id)
        relations = []
        if cell is not None:
            for relation in cell.findAll('li'):
                try:
                    date_str = relation.find('div', 'dates').string.strip()
                except AttributeError:
                    date_str = ''
                    dates = {'date_str': '', 'start_date': None, 'end_date': None}
                else:
                    dates = utilities.process_date_string(date_str)
                if date_format == 'iso':
                    formatted_dates = {
                        'date_str': date_str,
                        'start_date': utilities.convert_date_to_iso(dates['start_date']),
                        'end_date': utilities.convert_date_to_iso(dates['end_date']),
                    }
                elif date_format == 'obj':
                    formatted_dates = dates
                details = [string for string in relation.find('div', 'linkagesInfo').stripped_strings]
                try:
                    identifier = details[0]
                    title = details[1][2:]
                except IndexError:
                    identifier = details[0]
                    title = details[0]
                relations.append({
                    'date_str': formatted_dates['date_str'],
                    'start_date': formatted_dates['start_date'],
                    'end_date': formatted_dates['end_date'],
                    'identifier': identifier,
                    'title': title
                })
        else:
            relations = None
        return relations

    def get_digitised_pages(self, entity_id=None):
        '''
        Returns the number of pages (images) in a digitised file.
        Note that you don't need a session id to access these pages,
        so there's no need to go through get_url().
        '''
        # url = 'http://recordsearch.naa.gov.au/scripts/Imagine.asp?B={}&I=1&SE=1'.format(entity_id)
        url = 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/ViewImage.aspx?B={}'.format(entity_id)
        br = RoboBrowser(parser='lxml')
        br.open(url)
        try:
            pages = int(br.find('span', attrs={'id': "lblEndPage"}).string)
        except AttributeError:
            pages = 0
        return pages

    def _get_advanced_search_form(self):
        # Added header 10 June 2015 -- otherwise causes error
        self.br.session.headers.update({'Referer': 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/BasicSearch.aspx'})
        self.br.open('http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/SearchScreens/AdvSearchItems.aspx')
        search_form = self.br.get_form(id="formSNRMaster")
        return search_form
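
# A hedged usage sketch for RSClient above; the barcode value is a placeholder
# and RecordSearch is assumed to be reachable for the guest logon.
if __name__ == '__main__':
    client = RSClient()
    print(client.get_digitised_pages('3445411'))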
Example #57
def parseWeek(year, week):
    """Parsing a specific week at http://nflweather.com/week/{}/Week-{}
    Follows all detial links, which is where must of the data is scraped.
    Scrapes weather, and stadium enough per week, and stores them in their respective collections
    """
    logger = makeLogger(str(year) + '_' + str(week), r'./logs_nflWeather/')

    startTime = datetime.now()

    logger.debug('Starting %d %d', year, week)

    weather_list = []
    stadium_list = []

    if col_weather_info.find({'year': year, 'week': week}).count():
        logger.debug('Already parsed %d %d', year, week)
        return None

    wait = random.uniform(1.5,3.5)
    logger.debug('Waiting %f', wait)
    time.sleep(wait)

    logger.debug('Opening main page')
    browser = RoboBrowser(history=False,  parser='html5lib', user_agent=get_user_agent(logger), timeout=10)
    browser = open_or_follow_link(logger, browser, 'open', "http://nflweather.com/week/{}/Week-{}".format(year, week))

    data = browser.find(class_="footable")
    rows = data.find_all('tr')

    for index, row in enumerate(rows):
        logger.debug('Row %d of %d', index, len(rows))
        weatherInfo = {'year': year, 'week': week}
        stadiumInfo = {'year': year, 'week': week}

        try:
            columns = row.find_all('td')
            if columns:
                weatherInfo['weatherPicAlt'] = columns[8].find('img')['alt']
                weatherInfo['weatherText'] = columns[9].text.strip()
                weatherInfo['shortWind'] = columns[10].text
                details = columns[12]
                detailsLink = 'http://nflweather.com' + details.find('a')['href']
                wait = random.uniform(.5, 2.5)
                logger.debug('Waiting to follow_link %f', wait)
                time.sleep(wait)
                logger.debug('Following link')
                browser = open_or_follow_link(logger, browser, 'open', detailsLink)
                gameTime = browser.find('strong').text.split('-')[0].split(':', 1)[1].strip()
                awayTeam = browser.find_all(class_='g-away')[1].find('a').text.replace('  ', ' ').strip()
                homeTeam = browser.find_all(class_='g-home')[1].find('a').text.replace('  ', ' ').strip()
                spans = browser.find_all(class_='span5')
                if len(spans) != 2:
                    raise Exception('too many spans')

                weatherItems = spans[0].find_all('p')
                stadiumItems = spans[1].find_all('p')

                index = spans[0].text.find('Temperature:')
                weatherCondition = spans[0].text[:index].strip()

                for each in weatherItems:
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        weatherInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())
                
                for index, each in enumerate(stadiumItems):
                    split = each.text.strip().split(':')
                    if len(split) == 2:
                        if split[0] == 'Surface':
                            stadiumInfo['stadium'] = stadiumItems[index-1].text.strip()
                        stadiumInfo[cleanKey(split[0].strip())] = convertToNumber(split[1].strip())

                # find nfl_schedule, update gameTime, hopefully get result as id, insert id into both info dicts, append to _list
                schedule_query = {'year': year, 'week': week, 'homeTeam': homeTeam, 'awayTeam': awayTeam}
                schedule_doc = col_schedule.find(schedule_query)
                if schedule_doc.count() != 1:
                    error_docs = str(schedule_query) + ' | ' + str(weatherInfo) + ' | ' + str(stadiumInfo)
                    raise Exception("nfl_scedule doc not found " + error_docs)
                result = col_schedule.update_one(schedule_query, {'$set': {'dateTime': gameTime}})
                schedule_id = schedule_doc[0]['_id']
                weatherInfo['schedule_id'] = schedule_id
                stadiumInfo['schedule_id'] = schedule_id
                weather_list.append(weatherInfo)
                stadium_list.append(stadiumInfo)
        except:
            logger.exception(row)

    try:
        logger.debug('Bulk Creating weather_list')
        col_weather_info.insert_many(weather_list)
        logger.debug('Bulk Creating stadium_list')
        col_stadium_info.insert_many(stadium_list)
    except:
        logger.exception('insert_many error')
    logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime))

    closeLogger(str(year) + '_' + str(week))
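
# A hedged driver sketch for parseWeek() above, assuming the module-level
# MongoDB collections (col_schedule, col_weather_info, col_stadium_info) are
# initialized elsewhere; the season and week range are placeholders.
if __name__ == '__main__':
    for week in range(1, 18):
        parseWeek(2014, week)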
Example #58
File: dagr.py Project: orangepole/dagr
class Dagr:
        """deviantArt gallery ripper class"""

        NAME = basename(__file__)
        __version__="0.60"
        MAX_DEVIATIONS = 1000000 # max deviations

        def __init__(self):
                # Internals
                self.browser = None
                self.errors_count = dict()

                # Configuration
                self.username = ""
                self.password = ""
                self.overwrite = False
                self.reverse = False
                self.testOnly = False
                self.verbose = False

                # Current status
                self.deviant = ""

        def start(self):
                if not self.browser:
                        # Set up fake browser
                        self.set_browser()
                # Always run login
                self.login()

        def set_browser(self):
                USERAGENTS = (
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
                    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                    'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
                    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
                    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/6.0',
                    'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1) Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)'
                    )
                session = req_session()
                session.headers.update({'Referer': 'http://www.deviantart.com/'})

                self.browser = RoboBrowser(history=False, session=session, tries=3, user_agent=random.choice(USERAGENTS))

        def login(self):
                if not (self.username and self.password):
                        return
                print("Attempting to log in to deviantArt...")
                self.browser.open('https://www.deviantart.com/users/login?ref=http%3A%2F%2Fwww.deviantart.com%2F&remember_me=1')
                form = self.browser.get_forms()[1]
                form['username'] = self.username
                form['password'] = self.password
                self.browser.submit_form(form)

                if self.browser.find(text=re.compile("The password you entered was incorrect")):
                        print("Wrong password or username. Attempting to download anyway.")
                elif self.browser.find(text=re.compile("\"loggedIn\":true")):
                        print("Logged in!")
                else:
                        print("Login unsuccessful. Attempting to download anyway.")

        def get(self, url, file_name = None):
                if file_name is not None and (self.overwrite == False) and (path_exists(file_name)):
                        print(file_name + " exists - skipping")
                        return
                #TODO Test robobrowser retries and exceptions
                self.browser.open(url)

                if file_name is None:
                        return str(self.browser.parsed)
                else:
                        # Open our local file for writing
                        local_file = open(file_name, "wb")
                        #Write to our local file
                        local_file.write(self.browser.response.content)
                        local_file.close()

        def find_link(self, link):
                filelink = None
                mature_error = False
                self.browser.open(link)
                # Full image link (via download link)
                img_link = self.browser.get_link(text=re.compile("Download( (Image|File))?"))
                if img_link and img_link.get("href"):
                        self.browser.follow_link(img_link)
                        filelink = self.browser.url
                else:
                        if self.verbose:
                                print("Download link not found, falling back to direct image")
                        # Fallback 1: try meta (filtering blocked meta)
                        filesearch = self.browser.find("meta", {"name":"og:image"})
                        if filesearch:
                                filelink = filesearch['content']
                                if basename(filelink).startswith("noentrythumb-"):
                                        filelink = None
                                        mature_error = True
                        if not filelink:
                                # Fallback 2: try collect_rid, full
                                filesearch = self.browser.find("img", {"collect_rid":True, "class":re.compile(".*full")})
                                if not filesearch:
                                        # Fallback 3: try collect_rid, normal
                                        filesearch = self.browser.find("img", {"collect_rid":True, "class":re.compile(".*normal")})
                                if filesearch:
                                        filelink = filesearch['src']

                        if not filelink:
                                if mature_error:
                                        raise DagrException("probably a mature deviation")
                                else:
                                        raise DagrException("all attemps to find a link failed")

                filename = basename(filelink)
                return (filename, filelink)

        def handle_download_error(self, link, e):
                error_string = str(e)
                print("Download error (" + link + ") : " + error_string)
                if error_string in self.errors_count:
                        self.errors_count[error_string] += 1
                else:
                        self.errors_count[error_string] = 1

        def deviant_get(self, mode):
                print("Ripping " + self.deviant + "'s " + mode + "...")
                pat = "http://[a-zA-Z0-9_-]*\.deviantart\.com/art/[a-zA-Z0-9_-]*"
                modeArg = '_'
                if mode.find(':') != -1:
                        mode = mode.split(':',1)
                        modeArg = mode[1]
                        mode = mode[0]

                #DEPTH 1
                pages = []
                for i in range(0,int(Dagr.MAX_DEVIATIONS/24),24):
                        html = ""
                        url = ""

                        if mode == "favs":
                                url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/?catpath=/&offset=" + str(i)
                        elif mode == "collection":
                                url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/" + modeArg + "?offset=" + str(i)
                        elif mode == "scraps":
                                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=scraps&offset=" + str(i)
                        elif mode == "gallery":
                                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=/&offset=" + str(i)
                        elif mode == "album":
                                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/" + modeArg + "?offset=" + str(i)
                        elif mode == "query":
                                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?q=" + modeArg + "&offset=" + str(i)
                        else:
                                continue

                        html = self.get(url)
                        prelim = re.findall(pat, html, re.IGNORECASE|re.DOTALL)

                        c = len(prelim)
                        for match in prelim:
                                if match in pages:
                                        c -= 1
                                else:
                                        pages.append(match)

                        done = re.findall("(This section has no deviations yet!|This collection has no items yet!)", html, re.IGNORECASE|re.S)

                        if len(done) >= 1 or c <= 0:
                                break

                        print(self.deviant + "'s " +  mode + " page " + str(int((i/24)+1)) + " crawled...")

                if not self.reverse:
                        pages.reverse()

                if len(pages) == 0:
                        print(self.deviant + "'s " + mode + " had no deviations.")
                        return 0
                else:
                        try:
                                da_make_dirs(self.deviant + "/" + mode)
                                if (mode == "query") or (mode == "album") or (mode == "collection"):
                                    da_make_dirs(self.deviant + "/" + mode + "/" + modeArg)
                        except Exception as e:
                                print(str(e))
                        print("Total deviations in " + self.deviant + "'s gallery found: " + str(len(pages)))

                ##DEPTH 2
                counter2 = 0
                for link in pages:
                        counter2 += 1
                        if self.verbose:
                                print("Downloading " + str(counter2) + " of " + str(len(pages)) + " ( " + link + " )")
                        filename = ""
                        filelink = ""
                        try:
                                filename,filelink = self.find_link(link)
                        except (KeyboardInterrupt, SystemExit):
                                raise
                        except Exception as e:
                                self.handle_download_error(link, e)
                                continue

                        if self.testOnly == False:
                                if (mode == "query") or (mode=="album") or (mode == "collection"):
                                        self.get(filelink, self.deviant + "/" + mode + "/" + modeArg + "/" + filename)
                                else:
                                        self.get(filelink, self.deviant + "/" + mode + "/" + filename)
                        else:
                                print(filelink)

                print(self.deviant + "'s gallery successfully ripped.")

        def group_get(self, mode):
                if mode == "favs":
                        strmode  = "favby"
                        strmode2 = "favourites"
                        strmode3 = "favs gallery"
                elif mode == "gallery":
                        strmode  = "gallery"
                        strmode2 = "gallery"
                        strmode3 = "gallery"
                else:
                        print("?")
                        sys.exit()
                print("Ripping " + self.deviant + "'s " + strmode2 + "...")

                folders = []

                insideFolder = False
                #are we inside a gallery folder?
                html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/')
                if re.search(strmode2 + "/\?set=.+&offset=", html, re.IGNORECASE|re.S):
                        insideFolder = True
                        folders = re.findall(strmode + ":.+ label=\"[^\"]*\"", html, re.IGNORECASE)

                #no repeats
                folders = list(set(folders))

                i = 0
                while not insideFolder:
                        html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/?offset=' + str(i))
                        k = re.findall(strmode + ":" + self.deviant + "/\d+\"\ +label=\"[^\"]*\"", html, re.IGNORECASE)
                        if k == []:
                                break
                        flag = False
                        for match in k:
                                if match in folders:
                                        flag = True
                                else:
                                        folders+=k
                        if self.verbose:
                                print("Gallery page " + str(int((i/10) + 1)) + " crawled...")
                        if flag:
                                break
                        i += 10

                #no repeats
                folders = list(set(folders))

                if len(folders) == 0:
                        print(self.deviant + "'s " +  strmode3 + " is empty.")
                        return 0
                else:
                        print("Total folders in " + self.deviant + "'s " + strmode3 + " found: " + str(len(folders)))

                if self.reverse:
                        folders.reverse()

                pat = "http:\\/\\/[a-zA-Z0-9_-]*\.deviantart\.com\\/art\\/[a-zA-Z0-9_-]*"
                pages = []
                for folder in folders:
                        try:
                                folderid = re.search("[0-9]+",folder,re.IGNORECASE).group(0)
                                label = re.search("label=\"([^\"]*)",folder,re.IGNORECASE).group(1)
                        except:
                                continue
                        for i in range(0,int(Dagr.MAX_DEVIATIONS/24),24):
                                html = self.get("http://" + self.deviant.lower() + ".deviantart.com/" + strmode2 + "/?set=" + folderid + "&offset=" + str(i - 24))
                                prelim = re.findall(pat, html, re.IGNORECASE)
                                if not prelim:
                                        break
                                for x in prelim:
                                        p = str(re.sub(r'\\/','/',x))
                                        if p not in pages:
                                                pages.append(p)
                                if self.verbose:
                                        print("Page " + str(int((i/24) + 1)) + " in folder " + label + " crawled...")

                        if not self.reverse:
                                pages.reverse()

                        try:
                                if mode == "favs":
                                        da_make_dirs(self.deviant + "/favs/" + label)
                                elif mode == "gallery":
                                        da_make_dirs(self.deviant + "/" + label)
                        except Exception as err:
                                print(err)
                        counter = 0
                        for link in pages:
                                counter += 1
                                if self.verbose:
                                        print("Downloading " +  str(counter) +  " of " + str(len(pages)) +  " ( " + link + " )")
                                filename = ""
                                filelink = ""
                                try:
                                        filename,filelink = self.find_link(link)
                                except (KeyboardInterrupt, SystemExit):
                                        raise
                                except Exception as e:
                                        self.handle_download_error(link, e)
                                        continue

                                if self.testOnly == False:
                                        if mode == "favs":
                                                self.get(filelink, self.deviant + "/favs/" + label + "/" + filename)
                                        elif mode == "gallery":
                                                self.get(filelink, self.deviant + "/" + label + "/" + filename)
                                else:
                                        print(filelink)

                print(self.deviant + "'s " + strmode3 + " successfully ripped.")

        def print_errors(self):
                if len(self.errors_count):
                        print("Download errors count:")
                        for error, count in self.errors_count.iteritems():
                                print("* " + error + " : " + str(count))
Example #59
def get_medicare_email(request, mmg):
    """

    :param request:
    :param mmg:
    :return:
    """

    mmg_back = mmg
    mmg_back['status'] = "FAIL"
    mmg_back['mmg_email'] = ""

    PARSER = settings.BS_PARSER
    if not PARSER:
        if settings.DEBUG:
            print('Default Parser for BeautifulSoup:', 'lxml')
        PARSER = 'lxml'

    # Call the default page
    rb = RoboBrowser()

    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    target_page = "https://www.mymedicare.gov/myaccount.aspx"
    # Open the form to start the login
    rb.open(target_page)
    # Get the parsed page content
    page = rb.parsed

    if settings.DEBUG:
        print("===============================")
        print("on page:", rb.url)
        print("MyAccount:", page)


    my_email = rb.find("div",
                       attrs={"class":"ctl00_ctl00_ContentPlaceHolder1_ctl00_ctl00_ctl00_ctl01_UserInfo_pnlEmailSettings"})

    if settings.DEBUG:
        print("What email information:", my_email)
    mail_address = ""
    if my_email:
        mail_addr = my_email.find("div",
                                  attrs={"class": "myaccount-data"})
        if mail_addr:
            mail_address = mail_addr.text

    mmg_back['mmg_email'] = mail_address
    if rb.url == target_page:
        mmg_back['url'] = rb.url
        mmg_back['status'] = "OK"


    if settings.DEBUG:
        print("Email:", mail_address)
        print("url:", rb.url)

    return mmg_back