def getDomFromFile(url):
    #html = requests.get("http://www.autolanka.com/Buy.asp").content
    html=open('index'+str(url),'r')
    dom = lxml.html.fromstring(html.read())
    #data=minePage(dom)

    return dom
Example #2
    def get_email(self, url, search_word):
        self._clear_variable()

        if url == 'nan':
            self.emails = ''
            return

        if fnmatch.fnmatch(url, '*.txt') or fnmatch.fnmatch(url, '*.pdf'):
            self.emails = ''
            return

        try:
            html = urlopen(url)
            soup = BeautifulSoup(html.read(), "lxml")
            email = soup.find_all(string=re.compile(search_word))
            self._set_emails(email)
            if len(self.emails) > 0:
                print('Found a string matching "{}".\nemails: {}'.format(
                    search_word, self.emails))
            else:
                print('No string containing "{}" was found.'.format(search_word))

        except urllib.error.HTTPError as e:
            print(e)
            if e.code == 403:
                self.emails = None
            else:
                self.emails = ''
Example #3
def download(url,num_retries=2,headers={'User-agent':'wswp'}):
    print 'Downloading:'+url
    request=urllib2.Request(url,headers=headers)
    try:
        response=urllib2.urlopen(request)
        html=response.read()
        if response.info().get('Content-Encoding')=='gzip':
            html=gzip.GzipFile(fileobj=StringIO.StringIO(html),mode="r")
            try:
                html=html.read()   #.decode('gbk').encode('utf-8')
            except IOError as e1:
                html=urllib2.urlopen(request).read()    # Amazon responses occasionally come back malformed
    except urllib2.URLError as e:
        print 'Downloading error:',e.reason
        html=None
        if num_retries > 0:
            if hasattr(e,'code') and 500<=e.code<600:
                return download(url,num_retries-1,headers)

    #print html
    return html
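A minimal usage sketch for the retry-on-5xx downloader above (the URL is a placeholder, not from the original script):

# Hypothetical call: returns the page body, or None once the retries are exhausted.
page = download('http://example.com/', num_retries=2)
if page is not None:
    print(len(page))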
Example #4
def match_walletID_bitaddr(ID_txhash, address_type):
    global idx
    socket.setdefaulttimeout(3)
    for walletId in ID_txhash.keys():
        idx += 1
        print(idx)
        try:
            txhashes = ID_txhash[walletId]
        except Exception as e:
            continue

        for txhash in txhashes:
            try:
                request = urllib.request.urlopen('http://www.qukuai.com/search/zh-CN/BTC/' + txhash)
                html = request.read()
                request.close()
                address = get_address(html, address_type)
                # print('method2 ', walletId, address)
            except Exception as e:
                try:
                    html = urllib.request.urlopen('https://blockchain.info/rawtx/' + txhash)
                    hjson = json.loads(html.read())
                    address = parse_transaction(hjson, address_type)
                    # print('method1 ', address)
                except Exception as e:
                    print('get address failed')
                    continue

            if walletId not in walletId_bitaddr:
                # print('1 ', walletId, address)
                walletId_bitaddr[walletId] = address
            else:
                # print( 'not 1 ', walletId, address)
                walletId_bitaddr[walletId].extend(address)
def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()

        round = roundPage.cssselect(
            "li[id='tpRound'] a")[0].text_content().replace(
                "round ", "").replace(" Rankings", "").strip()
        print "Round: " + round

        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # specified in the footer
        pageLinks = roundRows[-1].cssselect("a")

        #remove the "next page" link
        del pageLinks[-1]

        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL

            scrapePage(linkURL, round)

        calculateExtraStats(round)
Example #6
def gettitle(url):

    requests.packages.urllib3.disable_warnings()
    req = request.Request(url)
    try:
        resp = request.urlopen(req)
        html = urlopen(url)
        # parse the response body
        # catch the AttributeError raised when the <title> tag is missing from the page
        try:
            soup = BeautifulSoup(html.read(), 'lxml')
            title = soup.title.text
            tfw = open("title.txt", "a")
            tfw.write(str(soup.title.text) + "\n")
            tfw.close()
            ufw = open("url.txt", "a")
            ufw.write(str(resp.url) + "\n")
            ufw.close()
            # the files must be closed, otherwise nothing is flushed to disk
        except AttributeError as e:
            print(url + " " + "no title")
            efw = open("exception.txt", "a")
            efw.write(url + " no title" + "\n")
            efw.close()
    except error.HTTPError as e:
        print(e.code)
        efw = open("exception.txt", "a")
        efw.write(url + " " + str(e.code) + "\n")
        efw.close()
    except error.URLError as e:
        print(e.reason)
        efw = open("exception.txt", "a")
        efw.write(url + " " + str(e.reason) + "\n")
        efw.close()
 def get_html(self):
     try:
         html = urllib2.urlopen(URL)
     except Exception as e:
         self.exit(STATES.UNKNOWN, 'Error while opening url: %s' % str(e))
     if html.getcode() >= 400:
         self.exit(STATES.UNKNOWN, 'HTTP error: %d' % html.getcode())
     return html.read()
Example #9
def test1():
    j = json.loads('{"one" : "1", "two" : "2", "three" : "3"}')

    html = urlopen(
        "http://www.czce.com.cn/portal/DFSStaticFiles/Future/2017/20171026/FutureDataDaily.xls"
    )
    data = html.read()
    print(data)
    return
Example #10
 def getHtml(self, url):
     if (self.testUrl(url) is True):
         html = urllib.request.urlopen(url)
         mybytes = html.read()
         mystr = mybytes.decode("utf8")
         html.close()
         return mystr
     else:
         return None
Example #11
def yuandaima(ss):
    url = ss
    headers1 = {'GET': url,
                'Host': "www.icpcw.com",
                'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:28.0) Gecko/20100101 Firefox/28.0",
                'Referer': url}
    req = urllib.request.Request(url, headers=headers1)
    html = urllib.request.urlopen(req)
    scode = html.read().decode('utf-8', 'ignore')
    return scode
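A trivial, hypothetical call to yuandaima() above (the URL is a placeholder chosen to match the hard-coded Host header):

# Hypothetical usage: fetch the page source and preview the first 200 characters.
scode = yuandaima('http://www.icpcw.com/')
print(scode[:200])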
Example #12
def getSeniority(linkList):
    myList = []
    for link in linkList:
        html = urlopen(link)
        bs = BeautifulSoup(html.read(), 'html.parser')

        seniority = bs.find(
            'div', {'col star-section text-center active'}).findNext('p')
        myList.append(seniority.get_text())
    return myList
Example #13
 def __init__(self, url):
     print("load codeforces contest %s" % url)
     base = urlparse(url).netloc
     html = request.urlopen(url)
     self.dom = lxml.html.fromstring(html.read())
     self.contest_id = CFContest.get_contest_id(url)
     self.pdf_name = "CF" + self.contest_id + ".pdf"
     self.problems = []
     for problem_a_tag in self.dom.xpath('//table[@class="problems"]/tr[position() > 1]/td[1]/a'):
         self.problems.append(CFProblem("https://" + base + problem_a_tag.attrib['href']))
 def get_all_functions(self, passedurl, topics):
     '''open the function page for parsing'''
     html = urllib.urlopen(passedurl)
     html = html.read()
     maintree   = etree.parse(StringIO(html), self.parser)
     mainContent = maintree.xpath("//div[@class='section']")     #scrape main div containing data
     if self.url=='http://docs.scipy.org/doc/scipy/reference/':
         self.scrape_section(mainContent[0], topics, scipy_first=True)
     else:
         self.scrape_section(mainContent[0], topics)
Example #15
 def get_html(self, url):
     opener = urllib2.build_opener()
     # agence.santemontreal.qc.ca seems to prohibit access (403) to "custom" http agents (
     # like urllib2 one) ; by forcing User-agent we workaround the problem:
     opener.addheaders = [('User-agent', 'Mozilla/5.0')]
     try:
         html = opener.open(url)
     except Exception as e:
         self.exit(STATES.UNKNOWN, 'Error while opening url: %s' % str(e))
     if html.getcode() >= 400:
         self.exit(STATES.UNKNOWN, 'HTTP error: %d' % html.getcode())
     return html.read()
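For comparison, a minimal sketch of the same User-agent workaround applied to a single request rather than a whole opener (URL and values are illustrative, not from the original plugin):

# Hypothetical equivalent: set the User-agent on one urllib2.Request instead of an opener.
req = urllib2.Request('https://example.com/', headers={'User-agent': 'Mozilla/5.0'})
resp = urllib2.urlopen(req)
print(resp.getcode())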
 def request(self, url, params={}, timeout=180):
     error = None
     for x in range(0, settings.http_tries):
         try:
             if params:
                 params = urllib.urlencode(params)
                 html = urllib2.urlopen(url, params, timeout)
             else:
                 html = urllib2.urlopen(url)
             return html.read()
         except Exception as e:
             error = e
     raise error
Example #18
def getTitleAll(url, t1, t2, t3):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
        price = bsObj.findAll(t1, attrs={t2: t3})
        print(title.get_text())
        for el in price:
            print(el.get_text())
    except AttributeError as e:
        return None
    return price
Example #19
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
        price = bsObj.findAll("span", attrs={"class": "cost"})
        print(title.get_text())
        for el in price:
            print(el.get_text())
    except AttributeError as e:
        return None
    return price
Example #20
def search(s):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }

    req = request.Request('https://baike.baidu.com/item/' + quote(s, 'utf-8'),
                          headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html.read(), "html.parser")
    bs = bsObj.find_all(name='div', attrs={'class': 'para'})
    content = ""
    for i in bs:
        content = f'{content}{i.text}'
    return content
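A small, hypothetical driver for the Baidu Baike search() helper above (the search term is a placeholder):

# Hypothetical usage: fetch the article paragraphs for one term and preview them.
text = search('Python')
print(text[:200])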
Example #21
File: post.py  Project: demid5111/nlp
def expert_prepare(_url):
    dictionary = {}

    db = MySQLdb.connect(host='localhost', user='******',passwd='123qwe', db='infoport', charset='utf8', init_command='SET NAMES UTF8')
    cursor = db.cursor()
    cursor.execute('select interest, article_id from exbd')
    result = cursor.fetchall()
    i = 0
    k = 0
    listkeys = []
    dictkeys = {}
    for record in result:
        if record[i+1] == _url:#!=
            dictkeys[k] = record[i]
            k=k+1
            #listkeys.append(record[i])

    dictionary['keyword'] = dictkeys
            #dictionary['keyword'] = dictkeys.get('keys')
            #dictionary['keyword'] = listkeys


    #print dictionary['keyword']


    html = urllib.urlopen(_url)
    doc = lxml.html.document_fromstring(html.read().decode('utf-8', 'ignore'))
    post = doc.cssselect('div.main .person-appointment-title')[0]
    dictionary['pos'] = post.text  #.encode('utf-8')
    academictitle = doc.cssselect('div.main .person-appointment-title')[0]
    dictionary['academic_title'] = academictitle.text  #.encode('utf-8')
    fio = doc.cssselect('div.footer__breadcrumbs .b ')[0]  # full name (FIO)
    dictionary['fio'] = fio.text  #.encode('utf-8')
    items = doc.cssselect('div.g-pic')
    for item in items:
        image = item.get('style')
    s = image.split("'")
    page = 'http://www.hse.ru' + s[1]
    person_id = page.split("/")
    dictionary['person_id'] = person_id[6]
    #print page  # URL of the page that hosts the photo
    place = doc.cssselect('div.main .person-appointment-title + .link')
    #dictionary['place'] = place[0].text
    #print place[1].text  # prints the CITY
    dictionary['photo'] = page
    #json_data = json.dumps(dictionary)
    #print json_data
    return dictionary
Example #22
File: main.py  Project: melbaa/ircfw
 def use(self, rawcommand):
     if not len(rawcommand):
         return
     cooked = urllib.parse.urlencode({"search": rawcommand})
     html = urllib.request.urlopen(
         "http://t-rechnik.info/search.php?" + cooked)
     html = html.read().decode("utf8")
     root = lxml.html.fromstring(html)
     tbl = root.get_element_by_id("table")
     if len(tbl) == 3:
         txt = tbl[2].text_content()
         txt = re.sub(r"\r|\n", " ", txt)
         txt = re.sub(r"\s+", " ", txt)
         self.bot.privmsg(self.bot.sender[0], txt, option="multiline")
         return
     return "nothing found"
Example #23
def Scrape(tech, city, starting_page: int, ending_page: int):
    generalList = []
    #generalList.append('Job Title;;;Employer Name;;;Salary;;;Link;;;Seniority;;;describtion;;;experience')

    if not city: city = 'warszawa'

    for i in range(starting_page, ending_page + 1):
        #print("Trying crawling on page "+ str(i) + "/" + str(ending_page))
        if tech:
            url = 'https://nofluffjobs.com/pl/jobs/' + city + '/' + tech + '?criteria=city%3D' + \
                city + '%20' + tech + '&page=' + str(i)
        else:
            url = 'https://nofluffjobs.com/pl/jobs/' + city + '?criteria=city%3D' + \
                  city + '&page=' + str(i)
    # TODO: handle the case when neither tech nor the location is given
        try:
            html = urlopen(url)
            #print("HTML found (1/3)")
        except HTTPError as e:
            #print('HTML does not exist')
            break
        except URLError as e:
            #print("Server not found")
            break
        else:
            pass
            #print("Successfully connected to the server! (2/3)")

        bs = BeautifulSoup(html.read(), 'html.parser')

        title = getTitle(bs)
        employer = getEmployer(bs)
        salary = getSalary(bs)
        link = getLinks(bs)
        seniority = getSeniority(link)
        desc = getDescription(link)
        experience = getExperience(desc)
        for i in range(countOffers(bs)):
            # build the Oferta objects here
            jobOffer = Oferta(title[i], employer[i], salary[i], link[i],
                              seniority[i], desc[i], experience[i])
            # jobOffer = "%s;;;%s;;;%s;;;%s;;;%s;;;%s;;;%s" % (title[i], employer[i], salary[i],
            #                                      link[i], seniority[i], desc[i], experience[i])
            generalList.append(jobOffer)

    return generalList
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "lxml")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title


title = getTitle(url)
if title is None:
    print("Title could not be found")
else:
    print(title)
    def main(self):
        '''Scrapes function name, argument list, description for argument, URL for description, URL for examples.'''
        html = urllib.urlopen(self.url)
        html = html.read()
        maintree   = etree.parse(StringIO(html), self.parser)
        mainContent = maintree.xpath("//div[@class='section']")     #scrape main div containing data

        main_h1 = [ child for child in mainContent[0].iterchildren('h1') ]      #get its child h1
        contentHTML= (etree.tostring(main_h1[0], pretty_print=True))
        tree   = etree.parse(StringIO(contentHTML), self.parser)
        title_text = tree.xpath("//text()")[0].strip()      #title_text

        all_content = [ child for child in mainContent[0].iterchildren('div') ]     # get its child div
        contentHTML= (etree.tostring(all_content[0], pretty_print=True))
        tree   = etree.parse(StringIO(contentHTML), self.parser)
        all_content_class = tree.xpath("//@class")[0].strip()      
        if all_content_class=='toctree-wrapper compound':
            main_ul = [ child for child in all_content[0].iterchildren('ul') ]      #get its child ul    
        else:
            main_ul = [ child for child in all_content[1].iterchildren('ul') ]      #get its child ul

        main_li = [ child for child in main_ul[0].iterchildren('li') ]      #get its child li
        for each_li in main_li:
            main_a = [ child for child in each_li.iterchildren('a') ]      #get its child a
            sectionHTML= (etree.tostring(main_a[0], pretty_print=True))
            tree   = etree.parse(StringIO(sectionHTML), self.parser)
            main_topic = ' '.join(tree.xpath("//text()")).encode('utf-8').strip()
            main_topic_link = tree.xpath("//@href")[0].encode('utf-8').strip()
            # main_topic, main_topic_link

            sub_ul = [ child for child in each_li.iterchildren('ul') ]      #get its child ul
            if len(sub_ul)!=0:
                sub_li = [ child for child in sub_ul[0].iterchildren('li') ]      #get its children li
                for each_sub_li in sub_li:
                    sectionHTML= (etree.tostring(each_sub_li, pretty_print=True))
                    tree   = etree.parse(StringIO(sectionHTML), self.parser)
                    sub_topic = ' '.join(tree.xpath("//text()")).encode('utf-8').strip()
                    sub_topic_link = tree.xpath("//@href")[0].encode('utf-8').strip()
                    topics = {'main_topic': main_topic, 'main_topic_link': self.url+main_topic_link,
                    'sub_topic': sub_topic, 'sub_topic_link': self.url+sub_topic_link}
                    self.get_all_functions(topics['sub_topic_link'], topics)
            else:
                topics = {'main_topic': main_topic, 'main_topic_link': self.url+main_topic_link,
                    'sub_topic': '', 'sub_topic_link': ''}
                self.get_all_functions(topics['main_topic_link'], topics)
Example #26
    def save_model(self, request, obj, form, change):
        if obj and form.is_valid():
            toc = None
            excerpt = None
            if 'original_file' in form.changed_data:
                if obj.html_file:
                    obj.html_file.delete(save=False)
                f = request.FILES['original_file']
                html = _original_file_to_html(f)
                obj.html_file.save(obj.title+'.html', html, save=False)
                obj.html_file.close()
                html.seek(0)
                htmltree = lxml.html.fromstring(html.read().decode('utf-8'))
                toc = get_html_toc(htmltree)
                excerpt = get_html_excerpt(htmltree)
                f.close()

            obj.save(toc, excerpt)
Example #27
def parse_web_page(url, xpaths=None, links=False):
    """Parse a response returned by a URL.

    The response can be parsed on the basis of xpaths determined by the URL's
    Resource instance or the xpaths given. If the response is to be parsed based
    on the former, the xpaths can be normal or related to link extraction, and
    thus patch-finding/recursion.

    Args:
        url (str): The URL to be parsed.
        xpaths (list[str]): A list of xpaths to parse the response with respect
            to. Defaults to None. If None, the xpaths are taken from the URL's
            corresponding Resource instance.
        links (bool): If True, the links xpaths are used from the corresponding
            Resource, else the normal xpaths are. Defaults to False.

    Returns:
        list[str]: A list of strings scraped from the determined or given
            xpaths.

    Raises:
        Exception: If there is an error in opening the given URL.
    """
    logger.info("Opening %s...", url)
    try:
        html = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        raise Exception("Error opening {url}".format(url=url))
    logger.info("Crawled %s", url)

    search_results = []
    if not xpaths:
        if not links:
            xpaths = Resource.get_resource(url).normal_xpaths
        else:
            xpaths = Resource.get_resource(url).links_xpaths
    elements = lxml.html.fromstring(html.read())
    for element in elements:
        if element.tag != "body":
            continue
        for xpath in xpaths:
            search_results.extend(element.xpath(xpath))
        break
    return search_results
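A minimal sketch of calling parse_web_page() with explicit xpaths so the Resource lookup is bypassed (URL and xpath are placeholders, not from the original project):

# Hypothetical invocation: explicit xpaths override the Resource-based defaults.
hrefs = parse_web_page(
    "https://example.com/advisories",   # placeholder URL
    xpaths=["//a/@href"],               # collect every link reachable from <body>
)
print(hrefs[:10])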
Example #28
    def __init__(self, url):
        print("load codeforces problem %s" % url)
        html = request.urlopen(url)
        self.problem_id = CFProblem.get_problem_id(url)
        self.pdf_name = 'CF' + self.problem_id + '.pdf'
        self.dom = lxml.html.fromstring(html.read())
        self.contest_name = self.dom.xpath('//*[@id="sidebar"]/div[1]/table/tbody/tr[1]/th/a')[0].text

        base_tag = lxml.html.Element('base', href="https://%s" % urlparse(url).netloc)
        style_tag = lxml.html.Element('style')
        style_tag.text = '#pageContent>*:not(.problemindexholder) { display: none !important; } #header { display: none; } #footer { display: none; } .roundbox.menu-box { display: none; } #sidebar { display: none; } #body > br:nth-child(8) { display: none; } #pageContent { margin-right: 0 !important; } #body { padding-top: 0; } #MathJax_Message { display: none !important; }'
        self.dom.xpath('//html')[0].insert(0, base_tag)
        self.dom.xpath('//head')[0].append(style_tag)

        contest_tag = lxml.html.Element('div')
        contest_tag.text = self.contest_name
        #contest_tag.attrib['class'] = 'title'
        contest_tag.attrib['style'] = 'text-align: left;'
        self.dom.xpath('//*[@class="header"]')[0].insert(0, contest_tag)
Example #29
    def get_prefectures(self):
        # STEP 1: fetch the list of prefectures

        try:
            html = urlopen(URL_TOP)
            soup = BeautifulSoup(html.read(), "lxml")
            links = soup.select("table tr td a")

            for link in links:
                exclusion = str(link).count('HOME') or str(link).count(
                    '都道府県') or str(link).count('メール送信')
                if exclusion:
                    continue
                href = link.get('href')
                self.pref_list.append({'url': href, 'name': link.text})

        except Exception as e:
            print('-----page not found.-----')
            print(e)
            self.pref_list = None
Example #30
    def _load_html(self, html, parser=lxml.html.parse):
        self.form_files = {}

        if hasattr(html, 'seek'):
            html.seek(0)

        if isinstance(html, (unicode, str)):
            html = StringIO(html)

        if isinstance(html, requests.Response):
            html = StringIO(html.content)

        if len(html.read()) == 0:
            self.document = None
            return None
        else:
            html.seek(0)
            self.document = parser(html)

            return html
def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()
        
        round = roundPage.cssselect("li[id='tpRound'] a")[0].text_content().replace("round ", "").replace(" Rankings", "").strip()
        print "Round: " + round
        
        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # specified in the footer
        pageLinks = roundRows[-1].cssselect("a")
        
        #remove the "next page" link
        del pageLinks[-1]
        
        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL
        
            scrapePage(linkURL, round)
Example #32
File: test7.py  Project: demid5111/nlp
def main():
    html = urllib.urlopen(url)
    doc = lxml.html.document_fromstring(html.read().decode('utf-8', 'ignore'))
    post = doc.cssselect('div.main .person-appointment-title')[0]
    print post.text  # position
    post1 = urllib.urlencode(post)
    #print p
    academictitle = doc.cssselect('div.main .person-appointment-title')[1]
    print academictitle.text  # academic title
    academictitle1 = urllib.urlencode(academictitle)
    fio = doc.cssselect('div.footer__breadcrumbs .b ')[0]  # full name (FIO)
    print fio.text  # full name
    fio1 = urllib.urlencode(fio)
    items = doc.cssselect('div.g-pic')
    for item in items:
        image = item.get('style')
        #print image
    s = image.split("'")
    #print s[1]
    page = 'hse.ru'+s[1]  # URL of the page that hosts the photo
    print page  #hse.ru/pubs/share/direct/138568616
    dictionary = {'post':post1,'academic title': academictitle1, 'fio': fio1, 'photo': page}  # dictionary to be serialized as JSON
    print dictionary  #prints {'academic title': 'class=person-appointment-title', 'post': 'class=person-appointment-title', 'fio': 'class=b', 'photo': 'hse.ru/pubs/share/direct/138568616'}
    #print(json.dumps((d),sort_keys=True))
    json_data = json.dumps(dictionary)
    print (json.dumps(dictionary, sort_keys=True, indent=4, separators=(',', ': ')))
    # The dictionary printed as JSON data:
    # {
        #"academic title": "class=person-appointment-title",
        #"fio": "class=b",
        #"photo": "hse.ru/pubs/share/direct/138568616",
        #"post": "class=person-appointment-title"
    # }
    # For some reason the printed values are not the field contents themselves
    # (e.g. the full name) but only the name of the CSS class they live in. Why?
    # The same goes for all the other fields we need.
    elements_json = json.loads(json_data)
    print elements_json["post"]  # access by key
    #class=person-appointment-title
    return json_data
Example #33
    def get_cities(self):
        # STEP 2: fetch the list of cities and municipalities
        if self.pref_list is None:
            return

        for pref in self.pref_list:

            target_url = URL_TOP + pref['url']

            try:
                df = pd.DataFrame(columns=df_columns)
                html = urlopen(target_url)
                soup = BeautifulSoup(html.read(), "lxml")
                links = soup.select("center table tr td a")

                for link in links:
                    if str(link).count('☆'):
                        continue
                    href = link.get('href')
                    arr = href.split("//")
                    domain = arr[1]
                    domain = domain[:-1]

                    data = {
                        "pref": pref['name'],
                        "name": link.text,
                        "top_url": href,
                        'domain': domain
                    }
                    df = df.append(data, ignore_index=True)
                    print(data)

                self.pref_df = pd.concat([self.pref_df, df])

            except Exception as e:
                print('-----page not found.-----')
                print(e)
Example #34
    def __get_flat_details__(self, link, flat_params):
        """
        Функция получает url страницы с информацией о
        квартире. Возвращает словарь название параметра (как на
        странице) -> значение
        """
        url = settings.SITE_ROOT + link

        html = self.__get_url__(url)
        xhtml = lxml.html.fromstring(html.read())

        cells = xhtml.xpath(settings.DETAIL_CELLS_XPATH)

        result = dict()
        result[u"URL"] = url

        for i in range(len(cells) / 2):
            value = cells.pop().text_content()
            name = cells.pop().text_content()
            name = re.sub(":", "", name)
            name = name.strip()
            if name in flat_params:
                result[name] = value
        return result
Example #35
 def __init__(self, url):
     self.url = url
     html = urllib2.urlopen(url)
     self.source = lxml.html.fromstring(html.read())
Example #36
# python3
import urllib.request
import lxml.html
import re

url = 'http://em.scnu.edu.cn/article/xueyuantongzhi/zonghe/'
html = urllib.request.urlopen(url)
scode=html.read().decode('utf-8')

doc = lxml.html.document_fromstring(scode)
ss = doc.xpath("""//div[@class="c_news"]/ul/li/a/font/text()|//div[@class="c_news"]/ul/li/a/text()""")
bb = doc.xpath("""//div[@class="c_news"]/ul/li/span/text()""")

aa= list(zip(ss,bb))

print(aa)
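As a small follow-on sketch (not in the original snippet), the (title, date) pairs collected in aa could be written out to a CSV file:

import csv

# Assumed follow-on step: persist the scraped pairs; the output filename is arbitrary.
with open('notices.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'date'])
    writer.writerows(aa)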
def accident_records():
    print "reached"
    all_accidents = []
    for file_name in range(29):
        file_name = APP_ROOT + "/accidentApp" + "/try/" + str(
            file_name) + ".html"
        print file_name
        try:
            html = urllib.urlopen(file_name)
        except:
            continue
        html = html.read()
        i = 1

        while True:
            if i == 1:
                my_iter = 1
                my_iter2 = 3
            else:
                my_iter = 0
                my_iter2 = 0
            root1 = lxml.html.fromstring(html)
            try:
                main_content = root1.cssselect('div#pf' + str(i))
                i += 1

            except:
                break
            print main_content
            if main_content == []:
                break

            node = main_content[0]
            try:
                content_date = node.cssselect('div.x4')[my_iter:]
                content_time = node.cssselect('div.x4 div.t')[my_iter:]
                content_location = node.cssselect('div.x4 div.t')[my_iter:]

                death_1 = node.cssselect('div.x12')[my_iter2:]
                death_2 = node.cssselect('div.x1d')[my_iter:]
                death_3 = node.cssselect('div.x1e')[my_iter:]
                death_4 = node.cssselect('div.x1f')[my_iter:]

                injury_1 = node.cssselect('div.x13')[my_iter2:]
                injury_2 = node.cssselect('div.x20')[my_iter:]
                injury_3 = node.cssselect('div.x21')[my_iter:]
                injury_4 = node.cssselect('div.x22')[my_iter:]

                injury2_1 = node.cssselect('div.x14')[my_iter2:]
                injury2_2 = node.cssselect('div.x23')[my_iter:]
                injury2_3 = node.cssselect('div.x24')[my_iter:]
                injury2_4 = node.cssselect('div.x25')[my_iter:]

                vehicle_1 = node.cssselect('div.x15')
                vehicle_2 = node.cssselect('div.x26')
                vehicle_3 = node.cssselect('div.x27')
                vehicle_4 = node.cssselect('div.x28')
                vehicle_5 = node.cssselect('div.x29')
                vehicle_6 = node.cssselect('div.x2a')
                vehicle_7 = node.cssselect('div.x2b')
                vehicle_8 = node.cssselect('div.x2c')

                vehicle_damaged = node.cssselect('div.x18')[1:]
                rows = zip(content_date, content_time, content_location,
                           death_1, death_2, death_3, death_4, injury_1,
                           injury_2, injury_3, injury_4, injury2_1, injury2_2,
                           injury2_3, injury2_4, vehicle_1, vehicle_2,
                           vehicle_3, vehicle_4, vehicle_5, vehicle_6,
                           vehicle_7, vehicle_8, vehicle_damaged)

            except:
                pass

            for item in rows:
                try:
                    print "------------------------------"
                    accident = {}
                    my_date = map_number(item[0].cssselect("div.t")
                                         [0].text_content().split()[0])
                    print my_date
                    accident["year"] = my_date.split(".")[0]
                    accident["month"] = my_date.split(".")[1]
                    accident["day"] = my_date.split(".")[2]

                    time = map_number(item[0].cssselect("div.t")
                                      [1].text_content().split()[0])
                    accident["hour"] = time.split(":")[0]
                    accident["minute"] = time.split(":")[1]

                    accident["location"] = item[0].cssselect(
                        "div.t")[2].text_content().strip()
                    death = 0
                    for each_death in item[3:7]:
                        death += int(each_death.text_content().strip() or 0)

                    injury = 0
                    for each_injury in item[7:15]:
                        injury += int(each_injury.text_content().strip() or 0)
                    accident["death"] = death
                    accident["injury"] = injury
                    accident["vehicle_damaged"] = int(
                        item[-1].text_content().strip() or 0)

                    all_accidents.append(accident)
                    #print all_accidents
                except:
                    pass
    print all_accidents
    return all_accidents
import re    
import xml.etree.ElementTree as ET

# Blank Python
#import json #for json decoding
from lxml import etree     
from cStringIO import StringIO
import urllib

import re

totalLinks =[]
for i in range(21)[1:]:
    strAddr = "http://codingtrying.herobo.com/"+str(i)+".html"
    html = urllib.urlopen(strAddr)
    html = html.read()
    parser = etree.HTMLParser()
    tree   = etree.parse(StringIO(html), parser)
    mainContent = tree.xpath("//th[@class='rowA']/a/@href")
    for content in mainContent:
        if content !="http://www.dlapiper.com/us/people/#":
            totalLinks.append(content)




i=0;
for url in totalLinks:
    if i<=481:
        i=i+1
        continue
    def get_function_details(self, func_details, topics):
        html = urllib.urlopen(func_details['function_link'])
        html = html.read()
        self.parser = etree.HTMLParser()
        maintree   = etree.parse(StringIO(html), self.parser)
        mainContent1 = maintree.xpath("//dl[@class='method']")     #scrape main div containing data
        mainContent2 = maintree.xpath("//dl[@class='function']")     #scrape main div containing data
        if len(mainContent1)==0 and len(mainContent2)!=0:
            mainContent = mainContent2
        elif len(mainContent2)==0 and len(mainContent1)!=0:
            mainContent = mainContent1
        elif len(mainContent1)==0 and len(mainContent2)==0:
            return
        argument_list = [ child for child in mainContent[0].iterchildren('dt') ]      #get its child dt    
        contentHTML= (etree.tostring(argument_list[0], pretty_print=True))
        tree   = etree.parse(StringIO(contentHTML), self.parser)
        argument_list = tree.xpath("//text()")
        argument_list = ''.join(argument_list[1:len(argument_list)-1]).encode('utf-8').strip()
        
        # getting details for each args
        split_data = argument_list.split('(')
        full_function_name = split_data[0]
        sec_split_data = split_data[1].split(')')
        args = sec_split_data[:-1]
        arg_dict = {}
        if len(args)!=0:
            args = args[0].split(',')
            for each_arg in args:
                each_split = each_arg.split('=')
                if len(each_split)==1:
                    if each_arg.find('.')== -1:
                        arg_dict[each_arg] = {'optional_flag': 0, 'default_value': ''}
                else:
                    if each_split[0].find('.')== -1:
                        arg_dict[each_split[0]] = {'optional_flag': 1, 'default_value': each_split[1]}

        # parsing examples
        examples = ''
        dd =  [ child for child in mainContent[0].iterchildren('dd') ]      #get its child dd
        example_div =  [ child for child in dd[0].iterchildren('div') ]      #get its child div
        if len(example_div)!=0:
            contentHTML= (etree.tostring(example_div[0], pretty_print=True))
            tree   = etree.parse(StringIO(contentHTML), self.parser)
            example_div_class = tree.xpath("//@class")
            if example_div_class[0] == 'highlight-python':
                examples = tree.xpath("//text()")
                examples = ''.join(examples)

        parameters_table = [ child for child in mainContent[0].iterdescendants('table') ]      #get its child table
        if len(parameters_table)!=0:
            contentHTML= (etree.tostring(parameters_table[0], pretty_print=True))
            tree   = etree.parse(StringIO(contentHTML), self.parser)
            table_class = tree.xpath("//@class")
            if table_class[0] == 'docutils field-list':
                all_desc = [ child for child in parameters_table[0].iterdescendants('tr') ]      #get its child tr            
                # for parameters
                argument_desc = [ child for child in all_desc[0].iterchildren('td') ]      #get its child td       
                contentHTML= (etree.tostring(argument_desc[0], pretty_print=True))
                tree   = etree.parse(StringIO(contentHTML), self.parser)
                argument_desc_list = tree.xpath("//text()")
                para_arg={}
                para_arg['argument_desc'] = ''.join(argument_desc_list).encode('utf-8').strip()
                # for returns
                if len(all_desc) == 2:
                    parameter_desc = [ child for child in all_desc[1].iterchildren('td') ]      #get its child td
                    contentHTML= (etree.tostring(parameter_desc[0], pretty_print=True))
                    tree   = etree.parse(StringIO(contentHTML), self.parser)
                    parameter_desc_list = tree.xpath("//text()")
                    para_arg['parameter_desc'] = ''.join(parameter_desc_list).encode('utf-8').strip()
                para_arg['parameter_desc'] = para_arg.get('parameter_desc') if para_arg.get('parameter_desc')!=None else ''

                # final_data = {'function_name':func_details['function_name'],
                final_data = {'function_name':full_function_name,
                            'function_link':func_details['function_link'],
                            'function_description':func_details['function_desc'],
                            'argument_list':arg_dict,
                            'argument_description':para_arg['argument_desc'],
                            'return_parameter':para_arg['parameter_desc'],
                            'examples': examples,
                            'sub_topic':topics['sub_topic'],
                            'sub_topic_link':topics['sub_topic_link'],
                            'main_topic':topics['main_topic'],
                            'main_topic_link':topics['main_topic_link']}
                #write to mongodb
                self.mongo_obj.write_data(self.table_name, final_data)


        else:
            final_data = {'function_name':full_function_name,
                            'function_link':func_details['function_link'],
                            'function_description':func_details['function_desc'],
                            'argument_list':arg_dict,
                            'argument_description':'',
                            'return_parameter':'',
                            'examples': examples,
                            'sub_topic':topics['sub_topic'],
                            'sub_topic_link':topics['sub_topic_link'],
                            'main_topic':topics['main_topic'],
                            'main_topic_link':topics['main_topic_link']}
            self.mongo_obj.write_data(self.table_name, final_data)
Example #40
def accident_records():
    print "reached"
    all_accidents =[]
    for file_name in range(29):
        file_name = APP_ROOT+"/accidentApp"+"/try/"+str(file_name)+".html"
        print file_name
        try:
            html = urllib.urlopen(file_name)
        except:
            continue
        html = html.read()
        i =1

        while True:
            if i ==1:
                my_iter = 1
                my_iter2 = 3
            else:
                my_iter = 0
                my_iter2 = 0
            root1 = lxml.html.fromstring(html)
            try:
                main_content = root1.cssselect('div#pf'+str(i))
                i += 1

            except:
                break
            print main_content
            if main_content == []:
                break


            node = main_content[0]
            try:
                content_date = node.cssselect('div.x4')[my_iter:]
                content_time = node.cssselect('div.x4 div.t')[my_iter:]
                content_location = node.cssselect('div.x4 div.t')[my_iter:]
                
                death_1 = node.cssselect('div.x12')[my_iter2:]
                death_2 = node.cssselect('div.x1d')[my_iter:]
                death_3 = node.cssselect('div.x1e')[my_iter:]
                death_4 = node.cssselect('div.x1f')[my_iter:]    

                injury_1 = node.cssselect('div.x13')[my_iter2:]
                injury_2 = node.cssselect('div.x20')[my_iter:]
                injury_3 = node.cssselect('div.x21')[my_iter:]
                injury_4 = node.cssselect('div.x22')[my_iter:]

                injury2_1 = node.cssselect('div.x14')[my_iter2:]
                injury2_2 = node.cssselect('div.x23')[my_iter:]
                injury2_3 = node.cssselect('div.x24')[my_iter:]
                injury2_4 = node.cssselect('div.x25')[my_iter:]
                
                vehicle_1 = node.cssselect('div.x15')
                vehicle_2 = node.cssselect('div.x26')
                vehicle_3 = node.cssselect('div.x27')
                vehicle_4 = node.cssselect('div.x28')
                vehicle_5 = node.cssselect('div.x29')
                vehicle_6 = node.cssselect('div.x2a')
                vehicle_7 = node.cssselect('div.x2b')
                vehicle_8 = node.cssselect('div.x2c')
                
                vehicle_damaged = node.cssselect('div.x18')[1:]
                rows = zip(content_date, content_time, content_location,
                        death_1, death_2, death_3, death_4,
                        injury_1, injury_2, injury_3, injury_4,
                        injury2_1, injury2_2, injury2_3, injury2_4,
                        vehicle_1, vehicle_2, vehicle_3, vehicle_4, vehicle_5,
                        vehicle_6, vehicle_7, vehicle_8, vehicle_damaged)
                
            except:
                pass

            for item in rows:
                try:
                    print "------------------------------"
                    accident = {}
                    my_date =  map_number(item[0].cssselect("div.t")[0].text_content().split()[0])
                    print my_date
                    accident["year"] = my_date.split(".")[0]
                    accident["month"] = my_date.split(".")[1]
                    accident["day"] = my_date.split(".")[2]
                    

                    time =  map_number(item[0].cssselect("div.t")[1].text_content().split()[0])
                    accident ["hour"] = time.split(":")[0]
                    accident["minute"] = time.split(":")[1]

                    accident["location"] =  item[0].cssselect("div.t")[2].text_content().strip()
                    death = 0
                    for each_death in item[3:7]:
                        death+= int(each_death.text_content().strip() or 0)
                    
                    injury = 0
                    for each_injury in item[7:15]:
                        injury+= int(each_injury.text_content().strip() or 0)
                    accident["death"] = death
                    accident ["injury"] = injury
                    accident ["vehicle_damaged"] = int(item[-1].text_content().strip() or 0)
                    
                    all_accidents.append(accident)
                    #print all_accidents
                except:
                    pass
    print all_accidents
    return all_accidents
Example #41
import time
import sys
import codecs
import lxml.html
import urllib2

query = 'http://www39.atwiki.jp/osakahennyu/?cmd=backup&action=source&pageid=<PLACEHOLDER>&num=0'

for line in open(sys.argv[1], 'r'):
	url = query.replace('<PLACEHOLDER>', line.rstrip())

	while True:
		try:
			html = urllib2.urlopen(url)

			code = unicode(html.read(), 'utf-8')
			dom  = lxml.html.fromstring(code)
			wiki = dom.xpath('//pre')[0]
			
			fout = codecs.open(line.rstrip() + '.txt', 'w', 'utf-8')
			fout.write(wiki.text)
			fout.close()

			html.close()
			break
			
		except urllib2.HTTPError:
			raw_input('>>> error! press continue...')

	time.sleep(1)
Example #42
import re
import xml.etree.ElementTree as ET

# Blank Python
#import json #for json decoding
from lxml import etree
from cStringIO import StringIO
import urllib

import re

totalLinks = []
for i in range(21)[1:]:
    strAddr = "http://codingtrying.herobo.com/" + str(i) + ".html"
    html = urllib.urlopen(strAddr)
    html = html.read()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    mainContent = tree.xpath("//th[@class='rowA']/a/@href")
    for content in mainContent:
        if content != "http://www.dlapiper.com/us/people/#":
            totalLinks.append(content)

i = 0
for url in totalLinks:
    if i <= 481:
        i = i + 1
        continue
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
    def scrape_section(self, element, topics, scipy_first=False, all_info=None):
        if scipy_first:
            h1_topic = [ child for child in element.iterchildren('h1') ]      #get its child h1    
            actual_link = [ child for child in h1_topic[0].iterchildren('a') ]      #get its child a
            if len(actual_link)==2:
                contentHTML= (etree.tostring(actual_link[0], pretty_print=True))
                tree   = etree.parse(StringIO(contentHTML), self.parser)
                actual_link = tree.xpath("//@href")[0].split('/')
                if actual_link[0]== '..':
                    html = urllib.urlopen(self.url + actual_link[1])
                    html = html.read()
                    maintree   = etree.parse(StringIO(html), self.parser)
                    mainContent = maintree.xpath("//div[@class='section']")     #scrape main div containing data
                    self.scrape_section(mainContent[0], topics)
            else:
                return    
        else:        
            main_topics = [ child for child in element.iterchildren('div') ]      #get its child div
            for each_topic in main_topics:
                contentHTML= (etree.tostring(each_topic, pretty_print=True))
                tree   = etree.parse(StringIO(contentHTML), self.parser)
                div_class = tree.xpath("//@class")
                if div_class[0] == 'section':
                    title = [ child for child in each_topic.iterchildren('h2') ]      #get its child h2
                    mini_title, information='',''
                    if len(title)==0:
                        title = [ child for child in each_topic.iterchildren('h3') ]      #get its child h3
                    if len(title)!=0:
                        titleHTML= (etree.tostring(title[0], pretty_print=True))
                        title_tree   = etree.parse(StringIO(titleHTML), self.parser)
                        mini_title = title_tree.xpath("//text()")[0].encode('utf-8').strip()
                    if self.url == 'http://docs.scipy.org/doc/numpy/user/':
                        info = [ child for child in each_topic.iterchildren('p') ]      #get its child para
                        if len(info)!=0:
                            infoHTML= (etree.tostring(info[0], pretty_print=True))
                            info_tree   = etree.parse(StringIO(infoHTML), self.parser)
                            information = info_tree.xpath("//text()")[0].encode('utf-8').strip()
                            if all_info!=None:
                                info_details = {'mini_title': mini_title, 'mini_info': information,
                                'parent_title': all_info.get('mini_title'), 'parent_info': all_info.get('mini_info')}
                            else:
                                info_details = {'mini_title': mini_title, 'mini_info': information}
                        else:
                            info_details = {'mini_title': mini_title, 'mini_info': information}
                        self.scrape_section(each_topic, topics, all_info=info_details)

                    else:
                        self.get_func_tables(each_topic, topics)     # check if table of functions exists
                        # check if there is a section div within the div
                        self.scrape_section(each_topic, topics)   
                else:
                    if self.url == 'http://docs.scipy.org/doc/numpy/user/' and all_info!=None:
                        final_data = {'sub_topic':topics['sub_topic'],
                                    'sub_topic_link':topics['sub_topic_link'],
                                    'main_topic':topics['main_topic'],
                                    'main_topic_link':topics['main_topic_link']}
                        if all_info.get('parent_title')==None and all_info.get('parent_info')==None:
                            final_data['parent_title'] = all_info['mini_title']
                            final_data['parent_info'] = all_info['mini_info']
                            final_data['mini_title'] = ''
                            final_data['mini_info'] =''

                            self.mongo_obj.write_data(self.table_name, final_data)
                        else:
                            final_data['parent_title'] = all_info.get('parent_title')
                            final_data['parent_info'] = all_info.get('parent_info')
                            final_data['mini_title'] = all_info['mini_title']
                            final_data['mini_info'] =all_info['mini_info']

                            self.mongo_obj.write_data(self.table_name, final_data)
Example #44
#tf.close()
###########################################################################

###### Instagram API ######

client_id = '71b5f772fc5a467fbb4e6066ecbe9536'

access_token = "1451885321.71b5f77.ddac6f3e719c4afb8375ab1dda874fd9"
client_secret = "8fbf9fa995804da09d587ec0a3819e01"

api = InstagramAPI(access_token=access_token, client_secret=client_secret)
result = api.tag_recent_media(100, 10, moviename)

url = result[1]
html = urllib.urlopen(url)  ## open the URL that serves the data with urllib and keep the handle
htmlread = html.read().decode('utf-8')  ## read the response into htmlread

jjson = json.loads(htmlread)
data = jjson['data']

try:
    #saveFile = open('result.txt', 'a')

    for i in range(0, len(data)):

        a = data[i]
        tag = a['tags']

        for i in range(0, len(tag)):
            #saveFile.write(str(tag[i].encode('utf-8'))+" ")
            finaldata = finaldata + tag[i] + u" "
Example #45
from BeautifulSoup import BeautifulSoup
import re
import urllib
import lxml.html
import string
import json
import pickle

for char in string.uppercase:
    movieInfoList = []
    html = urllib.urlopen('http://www.gomolo.com/indian-movies-list-films-database?SearchChar=' + char)
    soup = BeautifulSoup(html.read())

    #print soup.html.head.title.string
    items = soup.findAll("div",attrs={"id":"divMain"})[0].contents[0].contents 

    movielinks = []
    for item in items:
        try:
            movielinks.append(item.contents[0].contents[0].attrs[0][1])
        except IndexError:
            print "IndexError"
            pass

    #movielinks = ['http://www.gomolo.com/bal-hanuman-2-movie/39179']

    for link in movielinks:

        movieInfo = {}

        arr = link.split("/")
Example #46
#!/usr/bin/env python

import scraperwiki
import requests
import lxml.html

from bs4 import BeautifulSoup
 
import requests


html=open('index','r')
content = html.read()
print content
soup = BeautifulSoup(content)
print(soup.prettify())
Example #47
from BeautifulSoup import BeautifulSoup
import re
import urllib
import lxml.html
import string
import json
import pickle

for char in string.uppercase:
    movieInfoList = []
    html = urllib.urlopen(
        'http://www.gomolo.com/indian-movies-list-films-database?SearchChar=' +
        char)
    soup = BeautifulSoup(html.read())

    #print soup.html.head.title.string
    items = soup.findAll("div", attrs={"id":
                                       "divMain"})[0].contents[0].contents

    movielinks = []
    for item in items:
        try:
            movielinks.append(item.contents[0].contents[0].attrs[0][1])
        except IndexError:
            print "IndexError"
            pass

    #movielinks = ['http://www.gomolo.com/bal-hanuman-2-movie/39179']

    for link in movielinks:
Example #48
            while len(word) > index:
                val=word[index]
                index =index+1
        else:
            val="NULL"

        #print var+val+"\n"
        if val == None:
            return
        
        di[var]=val
        return val

#html = requests.get("http://www.autolanka.com/Buy.asp").content
html=open('index','r')
dom = lxml.html.fromstring(html.read())


varia=["Code:","Added:","Make:","Model:","No:","Year:","Location:","Options:","Price:","Info:"]
di={}
ads={}
for entry in dom.cssselect('.BuyDataTD'):
    [extract(var,entry,di) for var in varia]
    if len(di)==10:
        print di



    #if (len(di)==10) and not(di['Code:'].replace("Code:","") in ads):
    #   ads[di['Code:'].replace("Code:","")]=di
    #print ads
Example #49
                val = word[index]
                index = index + 1
        else:
            val = "NULL"

        #print var+val+"\n"
        if val == None:
            return

        di[var] = val
        return val


#html = requests.get("http://www.autolanka.com/Buy.asp").content
html = open('index', 'r')
dom = lxml.html.fromstring(html.read())

varia = [
    "Code:", "Added:", "Make:", "Model:", "No:", "Year:", "Location:",
    "Options:", "Price:", "Info:"
]
di = {}
ads = {}
for entry in dom.cssselect('.BuyDataTD'):
    [extract(var, entry, di) for var in varia]
    if len(di) == 10:
        print di

    #if (len(di)==10) and not(di['Code:'].replace("Code:","") in ads):
    #   ads[di['Code:'].replace("Code:","")]=di
    #print ads
Example #50
#! python3

import urllib.request
import lxml.html
import re

url = 'http://em.scnu.edu.cn/article/xueyuantongzhi/zonghe/'
html = urllib.request.urlopen(url)
scode = html.read().decode('utf-8')

doc = lxml.html.document_fromstring(scode)
ss = doc.xpath(
    """//div[@class="c_news"]/ul/li/a/font/text()|//div[@class="c_news"]/ul/li/a/text()""")
bb = doc.xpath("""//div[@class="c_news"]/ul/li/span/text()""")

aa = list(zip(ss, bb))

print(aa)
Example #51
}
web = {}
web['新闻'] = 'https://searchcloudcomputing.techtarget.com.cn/news/'
for key in web:
    with open('D:/' + key + '.csv', 'w', newline='',
              encoding='utf-8-sig') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('title', 'abstract', 'type', 'content'))
        for i in range(2, 407):
            try:
                print((key + '%.2f' % ((i - 1) / 407 * 100)) + "%")
                req = request.Request(
                    'https://searchcloudcomputing.techtarget.com.cn/interviews/page/3/',
                    headers=headers)
                html = urlopen(req)
                bsObj = BeautifulSoup(html.read(), "html.parser")
                print(bsObj.text)
                bs = bsObj.find_all('h4', attrs={'class': 'newslist'})
                print(bs)
                for j in bs:
                    req = request.Request(j.find('a').get('href'),
                                          headers=headers)
                    print(j.find('a').get('href'))
                    html = urlopen(req)
                    bsObj = BeautifulSoup(html.read(), "html.parser")
                    bs = bsObj.find_all(name='div',
                                        attrs={'class': 'newslist'})
                    content = ''
                    for i in bs:
                        content = f'{content}{i.text}'
                    title = bsObj.find_all('h1')
def getHtml(url):
    html = urllib2.urlopen(url)
    page = lxml.html.fromstring(html.read())
    html.close()

    return page