Example #1
def add_audit(domain):
    get.dgs('https://www.semrush.com/projects/')
    #ADD DOMAIN
    get.dec('s-btn -xs -primary js-add-project')
    get.dsk(domain, 'js-input-domain')
    get.dsk(domain, 'js-input-name')
    get.dec('s-btn -s -success js-save-pr')
    while not get.de('setup', clas='data-action'):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup button')
    get.dec('setup', clas='data-action')
    while 'Audit' not in get.det('s-btn__text', index=-1):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup audit button')
    get.dec('s-btn__text', index=-1)
    counter = 0
    max_loop = 20
    try:  #WAIT FOR PROGRESS BAR
        while not get.de('s-widget__progress-title'):
            get.sleep(get.randint(3, 7))
            get.log('> Waiting for progress bar ' + str(counter * 3) + 's')
            if counter == max_loop:
                get.log('> Progress bar did not appear after ' + str(max_loop) + ' checks, giving up')
                break
            error_btn = get.de('s-btn -danger -xs')
            if error_btn:
                error_btn.click()
            counter += 1
    except:
        get.pe()
    return get_project_id(get.DR.current_url), counter < max_loop
Example #2
def soups(rows, folder, t_id):
    for row in rows:
        try:
            get.log('> GET ' + row['URL'])
            rq = get.get(row['URL'])
            folders = row['URL'].split('//')[1].split('/')[:-1]
            for i, segment in enumerate(folders):  #i == 0 is the domain itself; renamed to avoid shadowing the folder parameter
                if i == 0: continue
                row['FOLDER LEVEL ' + str(i)] = segment
            row['DEPTH'] = len(folders)
            row['FINAL URL'] = rq.url
            row['FINAL URL LENGTH'] = len(rq.url)
            row['HEADERS'] = rq.headers
            row['CONTENT TYPE'] = rq.headers.get('Content-Type', '')
            #row['CONTENT LENGTH']=rq.headers['Content-Length'] if 'Content-Length' in rq.headers else ''
            #row['SERVER']=rq.headers['Server']
            row['DATE'] = rq.headers.get('Date', '')
            #row['ENCODE']=rq.encoding
            row['RESPONSE TIME (MS)'] = rq.elapsed.microseconds / 1000
            row['REDIRECT TYPE'] = 'NONE' if not rq.is_redirect else 'PERMANENT' if rq.is_permanent_redirect else 'TEMPORARY'
            row['REDIRECT'] = rq.is_redirect
            row['STATUS'] = rq.reason
            row['STATUS CODE'] = rq.status_code
            row['HTTP/HTTPS'] = 'HTTPS' if 'https' in rq.url[:10] else 'HTTP'
            row['SOUP'] = get.bs(rq.content, 'html.parser')
            check('title', row)
            check('h1', row)
            check('h2', row)
            check_meta(row)
            check_img(row)
            check_canonical_link(row)
        except:
            get.pe()
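
A minimal usage sketch for soups, assuming the ptl wrapper exposes get.DataFrame and get.save as seen in Examples #4 and #10; the URLs, the output name, and the folder/t_id arguments are illustrative.

#Usage sketch (illustrative): crawl a couple of URLs and save an on-page report
rows = [{'URL': 'https://www.example.com/'},
        {'URL': 'https://www.example.com/blog/post'}]
soups(rows, folder='output', t_id=0)  #folder and t_id are illustrative; unused in the body shown above
df = get.DataFrame(rows)
df = df.drop(columns=['SOUP'], errors='ignore')  #parsed BeautifulSoup objects are not serializable
get.save(df, 'onpage-audit')  #assumes get.save persists a DataFrame under a name, as in Example #10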
Example #3
def check(tag, row):
    try:
        for i, _tag in enumerate(row['SOUP'].findAll(tag)):
            key = tag.upper()
            if 'h' in tag: key = key + '-' + str(i + 1)
            row[key] = _tag.text
            _len = len(row[key])
            row[key + ' LENGTH'] = _len
            row[key + ' STATUS'] = check_length_status(_len, tag)
    except:
        get.pe(str(_tag))
Example #4
def get_data_from_table(index=0):
    tbl = get.be('backgrid', 'table', index=index)
    if not tbl:
        get.log("> There is no table data")
        return
    cols = [e.text.strip() for e in tbl.thead.findAll('th')][1:]  #column names
    #get.log('> Columns: '+str(cols))
    rows = []
    for tr in tbl.tbody.findAll('tr'):
        row = {}
        try:
            tds = tr.findAll('td')[1:]
            for i, td in enumerate(tds):
                if cols[i] == 'Domain Name':  #domain name column
                    row[cols[i]] = td.text
                elif cols[i] == 'Overlap':
                    if td.a:  #overlap column
                        overlap = td.a.div['style'].replace('width: ', '').replace('%', '')
                        row[cols[i]] = float(overlap)
                    elif td.i:  #rank change column
                        text = td.text.split('(')
                        row['Rank'] = text[0]
                        row['Rank Change'] = text[1].replace(')', '') if len(text) > 1 else ''
                    else:
                        row[cols[i]] = -1
                elif cols[i] == 'Keyword':
                    keyword = get.bec(td)
                    row[cols[i]] = keyword[0].text
                    row[cols[i] + ' URL'] = keyword[1].text if len(keyword) > 1 else ''
                elif cols[i] == 'Ad Timeline':
                    keyword = get.bec(td)
                    row[cols[i]] = keyword[0].text
                elif len(cols[i]) > 0:  #other column
                    if '(' in td.text:
                        text = td.text.split('(')
                        row['Rank'] = int(text[0])
                        text[1] = text[1].replace(')', '')
                        row[cols[i]] = int(text[1]) if text[1].isdigit() else 0
                    else:
                        row[cols[i]] = get.s2f(td.text)
        except:
            get.pe()
        rows.append(row)
    return get.DataFrame(rows)
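
A short usage sketch, assuming the page containing the 'backgrid' table has already been loaded through the wrapper's driver; the output name is illustrative.

#Usage sketch (illustrative): parse the first backgrid table on the current page and persist it
df = get_data_from_table(index=0)
if df is not None and len(df) > 0:
    get.log('> Parsed ' + str(len(df)) + ' table rows')
    get.save(df, 'backgrid-table')  #assumes get.save accepts a DataFrame and a name, as in Example #10
else:
    get.log('> No table rows parsed')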
Example #5
def check_canonical_link(row):
    try:
        for link in row['SOUP'].findAll('link'):
            if not link.has_attr('rel'): continue
            for rel in link['rel']:
                if rel.lower() == 'canonical':
                    key = rel.upper()
                    row[key] = link['href']
                    row[key + ' LENGTH'] = len(row[key])
                    break
        if 'CANONICAL' not in row:  #no canonical link found on the page
            row['CANONICAL'] = 'None'
            row['CANONICAL LENGTH'] = ''
    except:
        get.pe(str(link))
Example #6
def check_dp(df, output, col):
    try:
        if col in df.keys():
            dps = list(df[df.duplicated(col)].groupby(col))
            if len(dps) > 0:
                for row_text, _ in dps:
                    _dp = df[df[col] == row_text]  #recompute so the first occurrence is included
                    row = {}
                    row['TAG'] = col
                    row['DUPLICATION TEXT'] = row_text
                    row['NUMBER OF DUPLICATION'] = len(_dp)
                    row['URL'] = '#\n' + '\n'.join(list(_dp['URL']))
                    output.append(row)
    except:
        get.pe(col)
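
A usage sketch for check_dp, assuming df is the crawl DataFrame built from the rows filled in by Example #2; the column list follows the keys created by check, check_meta, and check_canonical_link.

#Usage sketch (illustrative): collect duplicated on-page elements into a report
output = []
for col in ['TITLE', 'H1-1', 'META DESCRIPTION', 'CANONICAL']:
    check_dp(df, output, col)
duplicates = get.DataFrame(output)
get.log('> Found ' + str(len(duplicates)) + ' duplicated elements')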
Example #7
def check_meta(row):
    try:
        for i, meta in enumerate(row['SOUP'].findAll('meta')):
            if not meta.has_attr('name') or not meta.has_attr('content'):
                continue
            key = meta['name'].lower()
            if key != 'description' and key != 'keywords': continue
            key = 'META ' + key.upper()
            row[key] = meta['content']
            _len = len(row[key])
            row[key + ' LENGTH'] = _len
            #if key=='keywords':
            #row[key+' STATUS'] = check_length_status(_len, 'keywords')
            if key == 'META DESCRIPTION':
                row[key + ' STATUS'] = check_length_status(_len, 'description')
    except:
        get.pe(str(meta))
Example #8
def check_img(row):
    try:
        imgs = row['SOUP'].findAll('img')
        imgs_alt_tag = [img['alt'] for img in imgs if img.has_attr('alt')]
        row['Number of images'.upper()] = len(imgs)
        row['Number of with alt-tag'.upper()] = len(imgs_alt_tag)
        row['Number of without alt-tag'.upper(
        )] = len(imgs) - len(imgs_alt_tag)
        alt_tags = numpy.array([len(alt) for alt in imgs_alt_tag])  #alt text lengths; numpy must be imported at module level
        if len(alt_tags) > 0:
            row['alt-tag Minimum length'.upper()] = alt_tags.min()
            row['alt-tag Average length'.upper()] = alt_tags.mean()
            row['alt-tag Maximum length'.upper()] = alt_tags.max()
        df = get.DataFrame(imgs_alt_tag)
        row['Number of Alt-Tags that are duplicated'.upper()] = len(df[df.duplicated()])
    except:
        get.pe()
Example #9
def delete_project(pid, domain):
    #DELETE DOMAIN FROM PROJECTS LIST
    #IF THERE ARE 5 PROJECTS AND THE DOMAIN IS AT #5, THE WEBDRIVER WINDOW HEIGHT HAS TO BE
    #>1100 OR ELSE THE DELETE BUTTON IS NOT VISIBLE AND THEREFORE NOT CLICKABLE
    try:
        get.log('> Deleting project: ' + domain + ', PID=' + pid)
        get.dgs('https://www.semrush.com/projects/')
        div = get.de('s-project js-project-' + pid + ' ')
        while not div:
            get.sleep(get.randint(3, 7))
            get.dgs('https://www.semrush.com/projects/')
            div = get.de('s-project js-project-' + pid + ' ')
        div.find_element_by_class_name('sr-infomenu').click()
        content = div.find_element_by_class_name('sr-infomenu-content')
        content.find_elements_by_tag_name('li')[1].click()
        get.dsk(domain, 'Project name', 'placeholder')
        get.dec('s-btn -s -danger js-remove')
        get.log('> Deleted project: ' + domain + ', PID=' + pid)
        return True
    except:
        get.pe('Cannot delete the project: ' + domain + ' > PID=' + pid)
        return False
Example #10
try:
    get.log('> FILTERS: ' + str(FILTERS))
    get.setup(debug=True, driver=True)
    URL = 'https://www.semrush.com/'
    login(URL)
    error_times = 0
    error_allow = 5
    error_domain = None
    for i, domain in enumerate(DOMAINS):
        get.log('> Progress: ' + str(i + 1) + '/' + str(len(DOMAINS)) + ': ' +
                domain)
        pid, success = add_audit(domain)
        if success:
            get.log('> Add audit succeeded')
            get.save(get_errors(pid),
                     PROJECT + '/' + get.START_TIME + '/' + domain)
        else:
            #Add the failed domain back to the DOMAINS list so it is retried
            if not error_domain: error_domain = domain
            elif error_domain == domain: error_times += 1
            else: error_times = 0
            if error_times < error_allow:
                get.log('> Add ' + domain + ' back to project list ' +
                        str(error_times) + ' times')
                DOMAINS.append(domain)
                error_domain = domain
        if not delete_project(pid, domain):
            get.log('> Fatal error encountered.')
            break
except:
    get.pe()
get.quit(PROJECT)
Example #11
from ptl import get
get.setup(debug=True, driver=True)
try:
    pass  #your code goes here
except:
    get.pe()
get.quit()
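
A filled-in version of the template above, using only helpers that appear in the other examples (get.dgs to load a page, get.DR for the underlying webdriver, get.log for logging); the target URL is illustrative.

from ptl import get
get.setup(debug=True, driver=True)
try:
    get.dgs('https://www.semrush.com/projects/')  #load a page with the wrapper's driver
    get.log('> Landed on: ' + get.DR.current_url)  #get.DR is the webdriver, as in Example #1
except:
    get.pe()
get.quit()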