def save_info(name, email, web_url, img_url, major, org):
    """Save one scholar to the DB and download their profile picture.

    Commits a ``People`` row, then (when both ``img_url`` and ``email`` are
    present) fetches the image to ``pic/<email>.jpg``; failed downloads are
    appended to ``timeout.txt`` for later retry.
    """
    # 'orginazation' matches the (misspelled) column name on the People model.
    user = People(email=email, name=name, major=major, web=web_url,
                  orginazation=org)
    session.add(user)
    try:
        session.commit()
    except Exception:  # e.g. duplicate key -- keep the session usable
        session.rollback()
    time.sleep(1)  # throttle DB/HTTP activity
    if img_url is not None and email is not None:
        print(img_url)
        try:
            pic = requests.Session().get(img_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)  # 'with' closes f; explicit close removed
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                # trailing newline keeps one failure per line in the log
                f.write(email + " : " + img_url + "\n")
def getInfo(url):
    """Scrape UNM engineering directory entries and store non-emeritus faculty."""
    # Retry until the listing page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[starts-with(@class,'single-person-entry')]", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        title = extract("//span[@class='personlist-title']/text()", source)
        if title is not None and "Emeritus" in title:
            continue  # skip emeritus faculty
        # renamed from 'url' to avoid shadowing the function parameter
        img_src = extract("//img[@class='left person-list']/@src", source)
        if img_src is not None:
            pic_url = "http://engineering.unm.edu/faculty/directory/" + img_src
        else:
            pic_url = ""
        email = extract("//table/tr[1]/td[2]/a/@href", source)
        if email is not None:
            email = email.split(':')[-1]  # strip the 'mailto:' scheme
        else:
            continue  # email is required; skip entries without one
        name = extract("//h4/a/text()", source)
        web_url = extract("//h4/a/@href", source)
        if web_url is not None:
            web = "http://engineering.unm.edu/faculty/directory/" + web_url
        else:
            web = ""
        major = extract("//p[@class='areas']/text()", source)
        if major:
            major = "Department of " + major
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def get_info(url, major):
    """Scrape LSU directory rows, visit each personal page, store the result."""
    global img_url
    # Retry until the listing page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//table[@id="scholarships"]/tbody/tr', html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        web = extract("//a/@href", each)
        if web:
            web_url = "https://www.lsu.edu/" + web
            try:
                browser = webdriver.Chrome(
                    chrome_options=SelenuimParser(stopimage=2))
                browser.get(web_url)
                # NOTE(review): set after get(), so it cannot limit this
                # load -- presumably intended to go before; kept as-is.
                browser.set_page_load_timeout(3)
            except TimeoutException as e:
                print(e)
                browser.execute_script('window.stop()')  # keep the partial DOM
            finally:
                source = browser.page_source
                email = extract("//a[contains(@href, '@')]/text()", source)
                name = extract("//h1[@class='fac-name']/text()", source)
                img_url = extract("//div[@class='fac-photo']/img/@src", source)
                print(name, email, major, web_url, img_url)
                user = People(email=email, name=name, major=major, web=web_url)
                session.add(user)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
                time.sleep(1)
                if img_url is not None and email is not None:
                    print(img_url)
                    try:
                        pic = requests.Session().get(img_url, timeout=30)
                        with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                                  + email + ".jpg", "wb") as f:
                            f.write(pic.content)
                    except Exception:
                        with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                                  "a") as f:
                            f.write(email + " : " + img_url + "\n")
                browser.quit()
        else:
            continue
def getInfo(url, frontstr, org="Kansas State University"): try: res = fetch(url) except: return getInfo(url, frontstr, org) tmp = extract("//tbody/tr/td", res, True) # print(tmp) major = extract("//div[@id='ksu-unitbar']/h2/a/text()", res) for each in tmp: source = str(etree.tostring(each)) # print(source) email = extract("//a[contains(@href, '@')]/text()", source) if not email: continue name = extract("//strong/text()", source) if not name: name = extract("//strong/a/span/text()", source) web_url = extract("//a[contains(@href, '/people')]/@href", source) # print(web_url) img_url = "" if web_url: web_url = frontstr + web_url try: text = fetch(web_url) img_url = extract("//img[contains(@src, '/docs/people')]/@src", text) except: pass else: continue print(frontstr + str(img_url), " ", name, " ", email, " ", web_url, " ", major, " ", org) if img_url: try: img_url = frontstr + img_url pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1)
def get_info(url, major, org="San Diego State University"): global img_url try: browser = webdriver.Chrome() # html = fetch(url) browser.get(url) # tmp = extract("//div[@class='container']/table[@width='100%']/tbody/tr", html, True) tmp = browser.find_elements_by_xpath("//table[@width='100%']/tbody/tr") # print(tmp) except Exception as e: print(e) return get_info(url, major) for i in tmp: # each = str(etree.tostring(i)) each = i.get_attribute('innerHTML') # print(each) img = extract("//td[2]/img/@src", each) if img: img_url = "http://ccee.sdsu.edu" + img # print(img) web_url = extract("//td[5]/a/@href", each) if not web_url: web_url = "" name = extract("//td[3]/p/text()", each) email = extract("//td[6]/a/@href", each) if email: email = email.split(':')[1] else: continue print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url # print(img_url) try: pic = requests.Session().get(img_url, timeout=300) with open("/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except Exception as e: print(e) print(img_url) with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") f.close()
def get_info(url, org="CUNY--City College (Grove)"): global img_url # browser = webdriver.Chrome(chrome_options=SelenuimParser(stopimage=2, stopjs=2)) browser = webdriver.Chrome() try: browser.get(url) except Exception as e: print(e) browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_xpath("//ul[@id='list-staff']/li") for i in tmp: each = i.get_attribute('innerHTML') web = extract("//h3[@class='desktop']/a/@href", each) if web: web_url = "https://www.ccny.cuny.edu" + web else: web_url = "" email = extract("//a[contains(@href, '@')]/text()", each) # print(email) name = extract("//h3[@class='desktop']/a/text()", each) img_url = extract("//div[@class='lFloatGraphic']/a/img/@src", each) major = extract("//div[@class='inner2 floatLeft']/h4/text()", each) print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url print(img_url) try: pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url) f.close() browser.quit()
def getInfo(url, major):
    """Scrape UB engineering profile pages and store each person."""
    # Retry until the listing page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[@class='profilepage unstructuredpage page']", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        try:
            # renamed from 'url' to avoid shadowing the function parameter
            img_src = extract(
                "//div[@class='profileinfo-teaser-photo']/picture/img/@src",
                source)
            if img_src is not None:
                pic_url = "http://engineering.buffalo.edu/" + img_src
            else:
                pic_url = ""
            # .split(':')[-1] strips 'mailto:'; raises (and is caught
            # below) when no email link is present
            email = extract("//a[@class='longtext']/@href",
                            source).split(':')[-1]
            name = extract("//a[@class='title']/b/text()", source)
            web_url = extract("//a[@class='title']/@href", source)
            if web_url is not None:
                web = "http://engineering.buffalo.edu/" + web_url
            else:
                web = ""
            print(pic_url, " ", name, " ", email, " ", web, " ", major)
            try:
                pic = requests.Session().get(pic_url, timeout=20)
                with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                          + email + ".jpg", "wb") as f:
                    f.write(pic.content)
            except Exception:
                with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                          "a") as f:
                    f.write(email + " : " + pic_url + "\n")
            user = People(email=email, name=name, major=major, web=web)
            session.add(user)
            try:
                session.commit()
            except Exception:
                session.rollback()
            time.sleep(1)
        except Exception as e:
            print(e)  # skip malformed entries but keep scraping
def get_info(url, major):
    """Scrape the LSU BAE faculty table and store professor-rank rows."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//*[@id="maincontent"]/div/div/table/tbody/tr',
                          html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        title = extract("//td[3]/div[2]/text()", each)
        if title and "Professor" not in str(title):
            continue  # only keep professor-rank rows
        name = extract("//a[contains(@href, '/eng/bae')]/text()", each)
        img = extract("//img/@src", each)
        if img:
            # NOTE(review): when img is missing, img_url keeps its previous
            # (global) value from an earlier row -- kept as-is.
            img_url = "https://www.lsu.edu" + img
        email = extract("//a[contains(@href, '@')]/text()", each)
        if not email:
            continue
        web = extract("//a[contains(@href, '/eng/bae')]/@href", each)
        if web:
            web_url = "https://www.lsu.edu" + web
        else:
            web_url = ""
        print(name, email, major, web_url, img_url, title)
        user = People(email=email, name=name, major=major, web=web_url)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
        if img_url is not None and email is not None:
            print(img_url)
            try:
                pic = requests.Session().get(img_url, timeout=30)
                with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                          + email + ".jpg", "wb") as f:
                    f.write(pic.content)
            except Exception:
                with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                          "a") as f:
                    f.write(email + " : " + img_url + "\n")
def getInfo(url, major, org="New Mexico State University"): try: res = fetch(url) except: return getInfo(url, major, org) tmp = extract("//div[@class='entry-content']/table[1]/tbody/tr", res, True) # print(tmp) for each in tmp: source = str(etree.tostring(each)) # print(source) email = extract("//a[contains(@href, '@')]/@href", source) if not email: continue else: email = email.split(':')[1] fullname = extract("//td[1]/a[text()]/text()", source) if fullname: web_url = extract("//td[1]/a/@href", source) name = fullname.split(',')[1] + " " + fullname.split(',')[0] else: # fullname = extract("//td[1]/a[text()]/text()", source) name = "" # name = fullname.split(',')[1] + " " + fullname.split(',')[0] # print(web_url) img_url = extract("//td[1]//img/@src", source) # print(name) print(img_url, " ", name, " ", email, " ", web_url, " ", major, " ", org) if img_url: try: pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1)
def get_info(url, major):
    """Scrape LSU card listings, follow each personal page, store the result."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//div[@class="col-md-9"]/div[@class="col-md-3"]',
                          html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        web = extract("//p/a[1]/@href", each)
        if web:
            name = extract("//p/a[1]/@title", each)
            web_url = "https://www.lsu.edu" + web
            img = extract("//img/@src", each)
            if img:
                # NOTE(review): when img is missing, img_url keeps the
                # previous card's (global) value -- kept as-is.
                img_url = "https://www.lsu.edu" + img
            try:
                source = fetch(web_url)
                email = extract("//a[contains(@href, '@')]/text()", source)
                if not email:
                    continue
                print(name, email, major, web_url, img_url)
                user = People(email=email, name=name, major=major, web=web_url)
                session.add(user)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
                time.sleep(1)
                if img_url is not None and email is not None:
                    try:
                        pic = requests.Session().get(img_url, timeout=30)
                        with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                                  + email + ".jpg", "wb") as f:
                            f.write(pic.content)
                    except Exception:
                        with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                                  "a") as f:
                            f.write(email + " : " + img_url + "\n")
            except Exception:
                print(name + "Failed")  # personal page failed; skip
        else:
            continue
def getInfo(url, major):
    """Scrape the UTA EE faculty directory and store each person."""
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[@class='faculty-directory']/p", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        img_src = extract("//strong/img/@src", source)
        if img_src is not None:
            # src is relative like './photo.jpg'; keep only the file part
            pic_url = "http://www.uta.edu/ee/" + img_src.split("./")[-1]
        else:
            pic_url = ""
        email = re.search(r'<br/> <a href="(.*?)">', source, re.S)
        if email is not None:
            # FIX: parse the captured href ('mailto:addr') instead of the
            # Match object's repr, which truncates long matches.
            email = email.group(1).split(':')[-1].split('"')[0]
        else:
            continue
        name = extract("//strong/img/@alt", source)
        if name:
            # alt text is like 'Dr. First Last, Title'
            name = str(name).split('Dr. ')[-1].split(",")[0]
        else:
            continue
        web_url = re.findall(r'\| <a href="(.*?)">', source, re.S)
        if web_url:
            web = str(web_url[0])
        else:
            web = ""
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def getInfo(url, major):
    """Scrape the WPI directory (rendered page) and store each person."""
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            browser = webdriver.PhantomJS()
            browser.get(url)
            break
        except Exception:
            continue
    tmp = browser.find_elements_by_xpath(
        "//section[@aria-label='directory-table']/div")
    print(len(tmp))
    for each in tmp:
        source = each.get_attribute("innerHTML")
        pic_url = extract("//div[@class='member-photo']/img/@src", source)
        # FIX: the original tested the always-truthy page 'url', so pic_url
        # could stay None; test the extracted value instead.
        if not pic_url:
            pic_url = ""
        email = extract("//*[contains(@href, '@')]/text()", source)
        if email is not None:
            email = str(email).split(':')[-1].split('"')[0]
        else:
            continue
        name = extract("//div[@class='member-photo']/img/@alt", source)
        if not name:
            continue
        web_url = extract("//*[@class='name']/a/@href", source)
        if web_url:
            web = "https://www.wpi.edu/" + web_url
        else:
            web = ""
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception as e:
            print(e)
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def getInfo(url, major):
    """Scrape UB staff-directory sections and store each person."""
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            res = fetch(url)
            break
        except Exception:
            continue
    tmp = extract("//div[@class='staffdirectory imagebase section']", res, True)
    for each in tmp:
        source = str(etree.tostring(each))
        # renamed from 'url' to avoid shadowing the function parameter
        img_src = extract("//picture/img/@src", source)
        if img_src is not None:
            pic_url = "http://engineering.buffalo.edu/" + img_src
        else:
            pic_url = ""
        email = extract("//a[@class='longtext']/@href", source)
        if email is not None:
            email = email.split(':')[-1]  # strip 'mailto:'
        else:
            continue
        name = extract("//span[@class='staff_name_bolded']/a/text()", source)
        if name is None:
            # some entries have no profile link; name is plain text
            name = extract("//span[@class='staff_name_bolded']/text()", source)
        web_url = extract("//span[@class='staff_name_bolded']/a/@href", source)
        if web_url is not None:
            web = "http://engineering.buffalo.edu/" + web_url
        else:
            web = ""
        print(pic_url, " ", name, " ", email, " ", web, " ", major)
        try:
            pic = requests.Session().get(pic_url, timeout=30)
            with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                      + email + ".jpg", "wb") as f:
                f.write(pic.content)
        except Exception:
            with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                      "a") as f:
                f.write(email + " : " + pic_url + "\n")
        user = People(email=email, name=name, major=major, web=web)
        session.add(user)
        try:
            session.commit()
        except Exception:
            session.rollback()
        time.sleep(1)
def get_info(url, org="University of Georgia"): global img_url try: html = fetch(url) tmp = extract('//div[@class="people-list"]/article', html, True) # print(tmp) except: return get_info(url) for i in tmp: each = str(etree.tostring(i)) img = extract("//div[@class='photo']/@style", each) # print(img) if not img: img_url = "" else: # name = extract("//a/img/@alt", each).split(' photo')[0] img_url = img.split('url(')[1].split(');')[0] web_url = "http://www.engr.uga.edu" + extract("//a[@class='content']/@href", each) # print(web_url) try: source = fetch(web_url) email = extract("//a[contains(@href, '@')]/text()", source) name = extract("//div[@class='col-sm-8']/h1/text()", source).split(',')[0] major = extract("//ul[@class='categories']/li[1]/a/text()", source) print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url print(img_url) try: pic = requests.Session().get(img_url, timeout=30) with open("/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url) f.close() except Exception as e: print(web_url) print(e) pass
def get_info(url, major):
    """Scrape LSU photo listings, follow each profile, store the result."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract('//div[@class="col-md-12"]/p', html, True)
            break
        except Exception:
            continue
    for i in tmp:
        each = str(etree.tostring(i))
        img = extract("//a/img/@src", each)
        if not img:
            continue  # entries without a photo link are skipped entirely
        else:
            img_url = "https://www.lsu.edu" + img
        web_url = "https://www.lsu.edu" + extract("//a/@href", each)
        try:
            source = fetch(web_url)
            email = extract("//a[contains(@href, '@')]/text()", source)
            name = extract("//div[@class='col-md-12']/h2/text()",
                           source).split(',')[0]
            print(name, email, major, web_url, img_url)
            user = People(email=email, name=name, major=major, web=web_url)
            session.add(user)
            try:
                session.commit()
            except Exception:
                session.rollback()
            time.sleep(1)
            if img_url is not None and email is not None:
                print(img_url)
                try:
                    pic = requests.Session().get(img_url, timeout=30)
                    with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                              + email + ".jpg", "wb") as f:
                        f.write(pic.content)
                except Exception:
                    with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                              "a") as f:
                        f.write(email + " : " + img_url + "\n")
        except Exception as e:
            print(web_url)
            print(e)  # profile page failed; move on
def get_info(url, major):
    """Scrape IUPUI people links, follow each profile, store the result."""
    global img_url
    # Retry until the page loads (was unbounded recursion).
    while True:
        try:
            html = fetch(url)
            tmp = extract("//div[@class='people-text']/h4/a/@href", html, True)
            break
        except Exception:
            continue
    for each in tmp:
        if each:
            web_url = "http://www.engr.iupui.edu" + each
            res = fetch(web_url)
            img = extract("//div[@class='inset-right faculty-photo']/img/@src",
                          res)
            if img:
                # NOTE(review): when img is missing, img_url keeps the
                # previous profile's (global) value -- kept as-is.
                img_url = "http://www.engr.iupui.edu" + img
            name = extract("//div[@class='caption']/h5/text()", res)
            email = extract("//div[@class='caption']/p/text()[3]", res)
            if email:
                email = email.strip()
            else:
                continue
            print(name, email, major, web_url, img_url)
            user = People(email=email, name=name, major=major, web=web_url)
            session.add(user)
            try:
                session.commit()
            except Exception:
                session.rollback()
            time.sleep(1)
            if img_url is not None and email is not None:
                print(img_url)
                try:
                    pic = requests.Session().get(img_url, timeout=30)
                    with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                              + email + ".jpg", "wb") as f:
                        f.write(pic.content)
                except Exception:
                    with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                              "a") as f:
                        f.write(email + " : " + img_url + "\n")
def get_info(url, major, org):
    """Scrape the U. Arkansas directory grid (rendered page) and store people."""
    global img_url
    # Retry until the page loads (was unbounded recursion, which also
    # leaked one Chrome instance per failed attempt).
    while True:
        try:
            browser = webdriver.Chrome(
                chrome_options=SelenuimParser(stopimage=2, stopjs=2))
            browser.get(url)
            tmp = browser.find_elements_by_xpath(
                "//div[@class='col-sm-3 col-md-3 col-xs-6 uark-unify-heights']")
            break
        except Exception:
            continue
    for i in tmp:
        source = i.get_attribute("innerHTML")
        email = extract("//div[@class='email']/a/text()", source)
        if not email:
            continue
        web = extract("//a[1]/@href", source)
        name = extract("//div[@class='name']/text()", source)
        if web:
            web_url = "https:" + web  # hrefs are protocol-relative
        else:
            web_url = ""
        img = extract("//img[@class='img-responsive img-thumbnail']/@src",
                      source)
        if img:
            img_url = "https:" + img
        else:
            img_url = ""
        print(name, email, major, web_url, img_url, org)
        user = People(email=email, name=name, major=major, web=web_url,
                      orginazation=org)
        session.add(user)
        try:
            session.commit()
        except Exception as e:
            print(e)
            session.rollback()
        time.sleep(1)
        if img_url is not None and email is not None:
            try:
                pic = requests.Session().get(img_url, timeout=30)
                with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                          + email + ".jpg", "wb") as f:
                    f.write(pic.content)
            except Exception:
                with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                          "a") as f:
                    f.write(email + " : " + img_url + "\n\n")
    browser.quit()
    time.sleep(random.choice(range(2, 5)))  # polite delay between pages
def get_info(url, major, org="Virginia Commonwealth University"): global img_url browser = webdriver.Chrome( chrome_options=SelenuimParser(stopimage=2, stopjs=1)) # chrome_options = Options() # prefs = { # 'profile.default_content_setting_values': { # 'images': 2, # 'javascript': 1 # } # } # chrome_options.add_experimental_option('prefs', prefs) # chrome_options.add_argument('--headless') # browser = webdriver.Chrome(chrome_options=chrome_options) # browser = webdriver.Chrome() try: browser.get(url) # print(tmp) except Exception as e: print(e) browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_xpath('//div[@class="expert clearfix"]') # print(browser.page_source) for i in tmp: each = i.get_attribute('innerHTML') # print(each) img_url = extract("//img/@src", each) # print(img) web_url = extract("//h4/a/@href", each) name = extract("//h4/a/text()", each) if name: name = name.split(',')[0] else: continue email = extract("//a[contains(@href, '@')]/text()", each) print(name, email, major, web_url, img_url, org) user = People(email=email, name=name, major=major, web=web_url, orginazation=org) session.add(user) try: session.commit() except: session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url # print(img_url) try: pic = requests.Session().get(img_url, timeout=300) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except Exception as e: print(e) print(img_url) with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n") f.close() browser.quit() time.sleep(3)
def get_info(url):
    """Scrape the UAH engineering staff page, department by department."""
    global img_url, browser
    try:
        browser = webdriver.Chrome()
        browser.get(url)
    except Exception as e:
        browser.execute_script('window.stop()')  # stop a hung load, keep the DOM
        print(e)
    finally:
        # Each department's staff cards live under a different articleBody div.
        dean_tmp = browser.find_elements_by_xpath(
            '//div[@class="row multi layout-staff"][1]/div[@class="col-sm-4"]')
        cue_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[2]/div[@class="col-sm-4"]')
        cme_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[3]/div[@class="col-sm-4"]')
        cee_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[5]/div[@class="col-sm-4"]')
        ece_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[7]/div[@class="col-sm-4"]')
        iseem_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[9]/div[@class="col-sm-4"]')
        mae_tmp = browser.find_elements_by_xpath(
            '//*[@id="articleBody"]/div[11]/div[@class="col-sm-4"]')
        # Map each department name (stored as 'major') to its staff cards.
        info = {
            "Office of the Dean": dean_tmp,
            "Center for Undergraduate Engineering Education": cue_tmp,
            "Chemical & Materials Engineering Department": cme_tmp,
            "Civil & Environmental Engineering Department": cee_tmp,
            "Electrical & Computer Engineering Department": ece_tmp,
            "Industrial & Systems Engineering and Engineering Management Department": iseem_tmp,
            "Mechanical & Aerospace Engineering Department": mae_tmp
        }
        # NOTE(review): 'exception' is built but never consulted -- presumably
        # a planned skip list; kept for parity with the original.
        exception = [
            'Office of the Dean',
            'Center for Undergraduate Engineering Education',
            'Mechanical & Aerospace Engineering Department'
        ]
        for each in info.keys():
            for i in info[each]:
                res = i.get_attribute("innerHTML")
                major = each
                web = extract("//a[contains(@href,'departments')]/@href", res)
                if web:
                    if "https://" in web:
                        web_url = web
                    else:
                        web_url = "https://www.uah.edu" + web
                else:
                    web_url = ""
                img = extract("//div[@class='image-holder']/img/@src", res)
                if img:
                    # NOTE(review): when img is missing, img_url keeps the
                    # previous card's (global) value -- kept as-is.
                    img_url = "https://www.uah.edu" + img
                name = extract("//div[@class='image-holder']/img/@alt", res)
                if not name:
                    continue
                email = extract("//a[contains(@href,'@uah.edu')]/text()", res)
                if not email:
                    continue
                print(name, " ", email, " ", web_url, " ", img_url, " ", major)
                user = People(email=email, name=name, major=major, web=web_url)
                session.add(user)
                try:
                    session.commit()
                except Exception:
                    session.rollback()
                time.sleep(1)
                if img_url is not None and email is not None:
                    print(img_url)
                    try:
                        pic = requests.Session().get(img_url, timeout=30)
                        with open("/Users/sunlei/scholar-private/out_of_data_module/pic/"
                                  + email + ".jpg", "wb") as f:
                            f.write(pic.content)
                    except Exception:
                        with open("/Users/sunlei/scholar-private/out_of_data_module/timeout.txt",
                                  "a") as f:
                            f.write(email + " : " + img_url + "\n")
def get_info(url, major, org="Embry-Riddle Aeronautical University"): browser = webdriver.Chrome( chrome_options=SelenuimParser(stopimage=2, stopjs=1)) try: browser.get(url) except: browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_class_name('media') # print(tmp) for each in tmp: res = each.get_attribute('innerHTML') # print(res) img_url = extract("//img[@class='media-object']/@src", res) if not img_url: name = extract("//h4[@class='media-heading name']/text()", res) else: name = extract("//img[@class='media-object']/@alt", res) web = "" id = extract('//div/a[@data-toggle="modal"]/@data-faculty-uid', res) if id: js_url = 'https://webforms.erau.edu/common/services/peoplesearch/faculty.cfc?callback=jQuery1113072479908569549_1541990949502&method=getFacultyByUid&returnformat=plain&uidList=' + id try: text = fetch(js_url) email = re.findall(r'"EMAIL":"(.*?)",', text, re.S)[0] print(email, name, web, img_url, major, org) user = People(email=email, name=name, major=major, web=web, orginazation=org) session.add(user) try: session.commit() except Exception as e: print(e) session.rollback() time.sleep(1) if img_url is not None and email is not None: # img = "http://be.utdallas.edu" + img_url # print(img_url) try: pic = requests.Session().get(img_url, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n\n") f.close() except Exception as e: print(e) pass else: continue browser.quit() time.sleep(random.choice(range(2, 5)))
def get_info(url, org="Florida A&M University - Florida State University"): browser = webdriver.Chrome( chrome_options=SelenuimParser(stopimage=2, stopjs=2)) # browser = webdriver.Chrome() try: browser.get(url) except: browser.execute_script('window.stop()') finally: tmp = browser.find_elements_by_xpath( "//div[contains(@class, 'col-xs-6 col-sm-4 col-md-3 col-lg-2 views-col')]" ) # print(tmp) for each in tmp: res = each.get_attribute('innerHTML') # print(res) web_url = extract("//div[@class='bio-photo']/div/a/@href", res) # print(web_url) if web_url: web = "https://eng.famu.fsu.edu" + web_url else: continue major = extract( "//div[@class='views-field views-field-field-department-s-']/div/text()", res) print(major) if major not in major_list: continue img_url = extract("//div[@class='bio-photo']/div/a/img/@src", res) if not img_url: name = extract( "//h4[@class='views-field views-field-title faculty-name'/]span/a/text()", res).split(',')[0] else: name = extract("//div[@class='bio-photo']/div/a/img/@title", res).split(',')[0] try: text = fetch(web) email = extract('//div[contains(text(), "@")]/text()', text) if not email: continue print(email, name, web, major, org) user = People(email=email, name=name, major=major, web=web, orginazation=org) session.add(user) try: session.commit() except Exception as e: print(e) session.rollback() time.sleep(random.choice(range(0, 3))) if img_url is not None and email is not None: img = "https://eng.famu.fsu.edu" + img_url print(img) try: pic = requests.Session().get(img, timeout=30) with open( "/Users/sunlei/scholar-private/out_of_data_module/pic/" + email + ".jpg", "wb") as f: f.write(pic.content) f.close() except: with open( "/Users/sunlei/scholar-private/out_of_data_module/timeout.txt", "a") as f: f.write(email + " : " + img_url + "\n\n") f.close() except Exception as e: print(e) pass browser.quit()