def getCsvUrl(soup, doc, id):
    # Extract the CSV download link; on failure, dump the raw page for later inspection.
    url = ' '
    try:
        csv = soup.find('csv')
        url = csv.a['href']
    except Exception:
        print('error:' + str(csv))
        path = Dir_path + csvError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return url
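# Every error path in these extractors calls a writeTXT helper that is not
# part of this listing. A minimal sketch, assuming it simply persists the raw
# page text so failed parses can be inspected later; only the (path, doc)
# signature comes from the calls above, the body is an assumption.
import os

def writeTXT(path, doc):
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)  # create the error folder on demand
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(doc))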
def getSubject(soup, doc, id):
    # Find the "See all subject areas" link on an author page.
    url = ' '
    stri = soup.find(text='See all subject areas')  # locate the target text
    if stri is not None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + subjError_path + str(id) + '.txt'
            writeTXT(path, doc)
    return url
def getColleagues(soup, doc, id):
    # Find the "See all colleagues of this author" link on an author page.
    url = ' '
    stri = soup.find(text='See all colleagues of this author')  # locate the target text
    if stri is not None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + collError_path + str(id) + '.txt'
            writeTXT(path, doc)
    return url
def getAdvisor(soup, doc, id):
    # Find the "Advisor only" link on an author page.
    url = ' '
    stri = soup.find(text='Advisor only')  # locate the target text
    if stri is not None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + advError_path + str(id) + '.txt'
            writeTXT(path, doc)
    return url
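# A minimal usage sketch for the three extractors above, assuming a saved
# author page on disk and the module-level globals (Dir_path, subjError_path,
# etc.) that the error paths reference. The file name and author id are
# placeholders.
from bs4 import BeautifulSoup

with open('author_12345.txt', encoding='utf-8') as f:
    doc = f.read()
soup = BeautifulSoup(doc, 'lxml')
print(getSubject(soup, doc, 12345))
print(getColleagues(soup, doc, 12345))
print(getAdvisor(soup, doc, 12345))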
def getSubject(soup, doc, id):
    # Variant that scopes the search to the page's 'left' element.
    url = ' '
    subject = None
    try:
        left = soup.find('left')
        subject = left.find(text='See all subject areas')  # match the link text, not a tag name
        if subject is not None:
            url = subject.parent['href']
    except Exception:
        print('error:' + str(subject))
        path = Dir_path + subjError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return url
def getColleagues(soup, doc, id):
    # Variant that scopes the search to the page's 'left' element.
    url = ' '
    colleagues = None
    try:
        left = soup.find('left')
        colleagues = left.find(text='See all colleagues of this author')  # match the link text, not a tag name
        if colleagues is not None:
            url = colleagues.parent['href']
    except Exception:
        print('error:' + str(colleagues))
        path = Dir_path + collError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return url
def getAdvisor(soup, doc, id):
    # Variant that scopes the search to the page's 'left' element.
    url = ' '
    advisor = None
    try:
        left = soup.find('left')
        advisor = left.find(text='Advisor only')  # match the link text, not a tag name
        if advisor is not None:
            url = advisor.parent['href']
    except Exception:
        print('error:' + str(advisor))
        path = Dir_path + advError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return url
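# The two sets of extractors differ in how they call find(): a bare string
# argument is a tag-name filter, while text= matches string content and
# returns a NavigableString whose .parent is the enclosing tag. A small
# self-contained check of that BeautifulSoup behavior:
from bs4 import BeautifulSoup

snippet = '<div class="left"><a href="/subjects">See all subject areas</a></div>'
demo = BeautifulSoup(snippet, 'lxml')
print(demo.find('See all subject areas'))   # None: no tag has that name
node = demo.find(text='See all subject areas')
print(node.parent['href'])                  # /subjects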
Example #8
def getSubject(soup, doc, id):
    # Find the "See all subject areas" link on an author page.
    url = ' '
    stri = soup.find(text='See all subject areas')  # locate the target text
    if stri is not None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + subjError_path + str(id) + '.txt'
            writeTXT(path, doc)
    '''
    for a in aTag:
        if a.string != '' and a.string == 'See all subject areas':
            url = 'http://dl.acm.org/' + a['href']
            break
    '''
    #print('subject' + url)
    return url
Example #9
def getColleagues(soup, doc, id):
    # Find the "See all colleagues of this author" link on an author page.
    url = ' '
    stri = soup.find(text='See all colleagues of this author')  # locate the target text
    if stri is not None:
        try:
            url = 'http://dl.acm.org/' + stri.parent['href']
        except Exception:
            print('error:' + str(stri.parent))
            path = Dir_path + collError_path + str(id) + '.txt'
            writeTXT(path, doc)
    '''
    for a in aTag:
        if a.string != '' and a.string is not None and 'See all colleagues' in a.string:
        #if a.string != '' and a.string == 'See all colleagues of this author':
            url = 'http://dl.acm.org/' + a['href']
            break
    '''
    #print(url)
    return url
def analysisPage(doc, id):
    # Parse a collaborators page and collect [name, url, userID] triples.
    nameLink = []
    #total = 0
    soup = BeautifulSoup(''.join(doc), "lxml")
    a = soup.find('a', {'name': "collab"})
    try:
        divAb = a.parent.parent
        tr = divAb.table.tr
        for td in tr.findAll('td'):
            for div in td.findAll('div'):
                if div.a.string is not None:
                    name = cleanName(div.a.string)
                    url = 'http://dl.acm.org/' + div.a['href']
                    url, userID = extractUserID(url)
                    #if checkSame(url, userID) == False:  # skip authors already recorded
                    nameLink.append([name, url, userID])
    except Exception:
        print('error:' + str(a))
        path = Dir_path + httpError_path + str(id) + '.txt'
        writeTXT(path, doc)
    return nameLink
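# analysisPage leans on cleanName and extractUserID, neither of which appears
# in this listing. A minimal sketch, assuming cleanName collapses whitespace
# and extractUserID pulls the numeric id query parameter out of an ACM author
# URL; both bodies are assumptions.
import re

def cleanName(name):
    return ' '.join(str(name).split())  # collapse runs of whitespace

def extractUserID(url):
    m = re.search(r'id=(\d+)', url)     # assumes the url carries ?id=<number>
    userID = m.group(1) if m else ''
    return url, userID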
def downloadWorker(dlQueue):
    # Queue-draining download loop. The original def line is missing from
    # this listing, so the name and signature here are assumptions; the body
    # relies on module-level globals (commHeaders, commCookies, http, uag,
    # httpProxies, dirPath) exactly as the original does.
    headers = commHeaders.copy()
    cookies = commCookies.copy()

    while not dlQueue.empty():
        dl = dlQueue.get()
        proxy = random.choice(http)  # pick a proxy without clobbering the proxy list
        ua = random.choice(uag)
        httpProxies['https'] = proxy
        #ts1 = datetime.datetime.now()
        # Tweak the request parameters.
        if ChangeOrNot():  # triggered at random
            headers = editeHeader(ua, headers, dl['name'])  # swap the user agent
            cookies = editeCookies(cookies)
        time.sleep(random.randint(5, 20))  # sleep for a random interval

        # Fetch the page for this url.
        html = str(getPage(dl['url'], httpProxies, headers, cookies))

        # Fetch failed: put the task back and move on.
        if html == ' ':
            dlQueue.put(dl)
            continue
        # Write the page to a file.
        try:
            path = dirPath + str(dl['id']) + '.txt'
            writeTXT(path, html)
            print('complete: ' + str(dl['id']))
        except Exception:
            print('error:' + str(dl['id']))
            dlQueue.put(dl)
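# A sketch of how the worker loop above might be driven, assuming the queue
# holds dicts with the 'id', 'name', and 'url' keys the loop reads, and that
# downloadWorker is the name given to that loop. Thread count and task data
# are placeholders.
import queue
import threading

dlQueue = queue.Queue()
dlQueue.put({'id': 1, 'name': 'Some Author',
             'url': 'http://dl.acm.org/author_page.cfm?id=1'})

threads = [threading.Thread(target=downloadWorker, args=(dlQueue,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()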