Example No. 1
def web_scrapper(url):
    browser = mechanize.Browser()  # mechanize browser, used to look like a regular browser to the search engine
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla')]
    r = browser.open(url).read()
    i = 0
    soup = BeautifulSoup(r, "html.parser")
    letters = soup.find_all("img")
    lobbying = {}
    l = lsapi('mozscape-295a2fa4c3', '95ef534d72971f96f3fd5776819a50f7')
    mozMetrics = l.urlMetrics(url)  # Mozscape metrics for the page being scraped
    #print (mozMetrics)
    for element in letters:
        keyset = element.attrs
        x = {}
        if ("alt" in keyset):
            x["alt"] = element["alt"].encode('ascii',
                                             'replace').decode('ascii')
        else:
            x["alt"] = ""
        if ("height" in keyset):
            x["height"] = element["height"].encode('ascii',
                                                   'replace').decode('ascii')
        else:
            x["height"] = 0
        if ("width" in keyset):
            x["width"] = element["width"].encode('ascii',
                                                 'replace').decode('ascii')
        else:
            x["width"] = 0
        x["size"] = getsizes(url)
        # whether to prefix src with url this way depends on the webpage;
        # for Bing you can just use src directly (see the urljoin sketch after this example)

        x['inbound links'] = mozMetrics['uid']
        x['moz page rank'] = mozMetrics['umrp']
        x['moz sub-domain rank'] = mozMetrics['fmrp']
        if ("src" in keyset):
            lobbying[
                url +
                element["src"].encode('ascii', 'replace').decode('ascii')] = x
#      urllib.urlretrieve(element["src"], os.path.basename(element["src"]))
        else:
            lobbying[url + str(i)] = x
            i = i + 1
    for key in lobbying.keys():
        x = lobbying[key]
        x['inbound links'] = mozMetrics['uid']
        x['moz page rank'] = mozMetrics['umrp']
        x['moz sub-domain rank'] = mozMetrics['fmrp']
        lobbying[key] = x
    return lobbying
def api_info(lobbying):
    l = lsapi('mozscape-295a2fa4c3', '95ef534d72971f96f3fd5776819a50f7')
    for key in lobbying.keys():
        print(lobbying[key]['href'])
        mozMetrics = l.urlMetrics(lobbying[key]['href'])
        lobbying[key]['inbound links'] = mozMetrics['uid']
        lobbying[key]['moz page rank'] = mozMetrics['umrp']
        lobbying[key]['moz subdomain rank'] = mozMetrics['fmrp']
        mozMetrics_domain = l.urlMetrics(lobbying[key]['domain'])
        lobbying[key]['domain inbound links'] = mozMetrics_domain['uid']
        lobbying[key]['domain page rank'] = mozMetrics_domain['umrp']
        print(lobbying[key])
    return lobbying
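The comment in web_scrapper about prefixing src with url hints at a more robust approach: relative image paths can be resolved against the page URL with urllib.parse.urljoin instead of plain string concatenation. A minimal sketch (Python 3; it assumes the same soup and url objects as in web_scrapper above, and the helper name is illustrative, not part of the original):

# Hypothetical helper, not part of the original example: resolve each <img> src
# against the page URL so relative, absolute and protocol-relative paths all work.
from urllib.parse import urljoin

def absolute_image_urls(soup, url):
    urls = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            urls.append(urljoin(url, src))  # e.g. "/img/a.png" -> "http://host/img/a.png"
    return urls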
Example No. 3
def moz_url_metrics(mozscape):
    results = []
    guser = werkzeug_cache.get('guser')
    gs = Gspreadsheet(guser.gmail, guser.gpassword, None)
    gs.login()
    ss = gs.gclient.open_by_url(mozscape.gspread_link)
    ws = ss.sheet1
    urls = gs.col_one(ws)
    # FIXME only use the first url at A2, for now
    url = urls[0]
    l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
              flask_app.config['MOZSCAPE_API_SECRET_KEY'])
    try:
        # Mozscape asks that requests not be made in parallel; batch URLs into a
        # single request instead (see the batching sketch after this example).
        now_timestamp = datetime.utcnow()
        nrow = 2
        metrics = l.urlMetrics(url)
        # gspread: update the cells of this row in one range call
        acells = ws.range("B%s:L%s" % (nrow, nrow))
        acells[0].value = metrics['uid']
        acells[1].value = metrics['uu']
        acells[2].value = metrics['ut']
        acells[3].value = metrics['us']
        acells[4].value = metrics['upa']
        acells[5].value = metrics['ueid']
        acells[6].value = metrics['umrp']
        acells[7].value = metrics['umrr']
        acells[8].value = metrics['fmrp']
        acells[9].value = metrics['fmrr']
        acells[10].value = metrics['pda']
        ws.update_cells(acells)
        mr = MozscapeResult.create(name=mozscape.name,
                                   url=url,
                                   uid=metrics['uid'],
                                   uu=metrics['uu'],
                                   ut=metrics['ut'],
                                   us=metrics['us'],
                                   upa=metrics['upa'],
                                   ueid=metrics['ueid'],
                                   umrp=metrics['umrp'],
                                   umrr=metrics['umrr'],
                                   fmrp=metrics['fmrp'],
                                   fmrr=metrics['fmrr'],
                                   pda=metrics['pda'],
                                   timestamp=now_timestamp)
        results.append(mr)  # record the created row so the returned count reflects it
    except Exception as e:
        print("Error: moz_url_metrics:\n%s" % e)
    return len(results)
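The batching note in moz_url_metrics refers to the fact that urlMetrics also accepts a list of URLs (as Example No. 8 below shows), so a whole column of spreadsheet rows can be fetched in one request. A minimal sketch, with placeholder credentials and illustrative URLs:

# Hypothetical batching sketch: one urlMetrics call for several URLs instead of
# one call per URL. Credentials and URLs below are placeholders.
l = lsapi("my-access-id", "my-secret-key")
urls = ["www.moz.com", "www.moz.com/blog"]
all_metrics = l.urlMetrics(urls)  # one dict of metrics per URL, in the same order
for url, metrics in zip(urls, all_metrics):
    print(url, metrics.get("pda"))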
Example No. 5
def get_backlinks(url, mozscapeAPIaccessID, mozscapeAPIkey):
    """Use the Mozscape API to retrieve some backlinks for a url.

    Returns a list of urls, or None if the url has no http(s) scheme.
    """
    # Mozscape expects the URL without the scheme, so strip http:// or https://
    if url.startswith("http://"):
        url = url[7:]
    elif url.startswith("https://"):
        url = url[8:]
    else:
        return None

    l = lsapi(mozscapeAPIaccessID, mozscapeAPIkey)
    links = l.links(url, filters=['external', 'nofollow'])
    result = []
    for link in links:
        result.append("http://" + link['uu'])
    return result
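A minimal usage sketch for get_backlinks; the access ID and secret key are placeholders and the URL is only illustrative:

# Hypothetical call to get_backlinks; replace the placeholder credentials with
# real Mozscape ones before running.
backlinks = get_backlinks("https://moz.com/blog", "my-access-id", "my-secret-key")
if backlinks is None:
    print("URL must start with http:// or https://")
else:
    for link in backlinks:
        print(link)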
Example No. 6
def moz_index_metadata():
    mim = None
    l = lsapi(flask_app.config['MOZSCAPE_API_ACCESS_ID'],
              flask_app.config['MOZSCAPE_API_SECRET_KEY'])
    try:
        now_timestamp = datetime.utcnow()
        try:
            mim = MozscapeIndexMetadata.get(MozscapeIndexMetadata.id == 1)
        except Exception as e:
            mim = MozscapeIndexMetadata()
            mim.timestamp = None
            print("Error: moz_index_metadata: "
                  "MozscapeIndexMetadata.get(MozscapeIndexMetadata.id == 1)\n%s" % e)
        # do we need to update db or just return mim:
        if mim.timestamp is None or now_timestamp >= mim.next_update:
            metrics = l.index_metadata()
            mim.index_name = metrics['index_name']
            mim.crawl_duration = metrics['crawl_duration']
            mim.external_links_per_page = metrics['external_links_per_page']
            mim.links_per_page = metrics['links_per_page']
            mim.links = metrics['links']
            mim.plds = metrics['plds']
            mim.fqdns = metrics['fqdns']
            mim.nofollow = metrics['nofollow']
            mim.urls = metrics['urls']
            if str(metrics['locked']) == 'false':
                mim.locked = False
            else:
                mim.locked = True
            mim.rel_canonical = metrics['rel_canonical']
            mim.last_update = datetime.fromtimestamp(metrics['last_update'])
            mim.next_update = datetime.fromtimestamp(metrics['next_update'])
            mim.timestamp = now_timestamp
            mim.save()  # create or update
    except Exception as e:
        print("Error: moz_index_metadata:\n%s" % e)
    return mim
def getBackLinks(Url):
    # print(Url)
    bLinks = set()
    l = lsapi('member-a1c2050723', '9776ad0162ea4c492b2b4d56a0cfcd1a')
    linksList = l.links(Url)
    # print(linksList)
    for items in linksList:
        # print(items)
        for key, value in items.items():
            if key == "uu":
                bLinks.add(value)
    bLinks = list(bLinks)

    # drop backlinks we have already visited
    for link in visitedPages:
        if link in bLinks:
            bLinks.remove(link)

    # sleep because of the free API rate limit, then return only the first two backlinks
    time.sleep(10)
    return bLinks[0:2]
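getBackLinks reads a module-level visitedPages collection that is not shown in the snippet. A minimal sketch of how it might be wired into a small breadth-first crawl; visitedPages, the seed URL, and the depth are assumptions, not part of the original:

# Hypothetical driver for getBackLinks. visitedPages is the module-level
# collection the function above expects; the seed URL and depth are illustrative.
visitedPages = []

def crawl(seed_url, depth=2):
    frontier = [seed_url]
    for _ in range(depth):
        next_frontier = []
        for page in frontier:
            if page in visitedPages:
                continue
            visitedPages.append(page)
            next_frontier.extend(getBackLinks(page))
        frontier = next_frontier
    return visitedPages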
Example No. 8
#! /usr/bin/env python

from lsapi import lsapi

l = lsapi('my-access-id', 'my-secret-key')

# As you may have noticed, there are lots of columns available
# I did what I could to make them easily-accessible, but there
# are a lot, and the names are long. So, the API calls have
# defaults

# Let's get some URL metrics. Results come back as an array of dictionaries;
# the i'th dictionary holds the results for the i'th URL
metrics = l.urlMetrics(['www.moz.com', 'www.moz.com/blog'])
# Now let's say we only want specific columns in the results
authorities = l.urlMetrics(['www.moz.com'], lsapi.UMCols.domainAuthority | lsapi.UMCols.pageAuthority)
# Or if you just need results for one URL
mozMetrics = l.urlMetrics('www.moz.com')

# Now for some anchor text results
anchorResults = l.anchorText('www.moz.com/blog')
# Or for just specific columns
anchorTermResults = l.anchorText('www.moz.com/blog', cols=lsapi.ATCols.term)

# Now for some links results
links = l.links('www.moz.com')
# The links API has more columns to specify, as well as sort, scope, etc.
links = l.links('www.moz.com', scope='domain_to_domain', sort='domain_authority',
                filters=['external', 'nofollow'], targetCols=lsapi.UMCols.url)
Example No. 9
from lsapi import lsapi
from lsapi import lsapiException
import time

# input: file with a list of seed urls
# output: file with a list of seed urls + backlinks

MOZ = { 'access_id': 'mozscape-d7201e2b23', 'secret_key': 'd605753f7d3a2f970353754a4b123b4c' }
l = lsapi(MOZ['access_id'], MOZ['secret_key'])
seeds_path = 'input/ebola-1000.txt'
result_path = 'input/ebolaSeeds-3.txt'

if __name__ == "__main__":

  print "Start backlink fetcher"
  
  with open(seeds_path, 'r') as seeds_fp:
    seeds = map(lambda x: x.replace("\n", ""), seeds_fp.readlines())

  backlinks_dict = {}
  cnt = 0

  # go through seeds, get backlinks from each and put in a dict[seed] = [backlinks]
  for seed in seeds:
    cnt += 1
    seed_backlinks = []
    try:
      links = l.links(seed, scope='page_to_page', sort='page_authority', filters=['external'], targetCols = lsapi.UMCols.url)
    except lsapiException, e:
      links = []
      print "lsapiException:", e
Example No. 10
"""
# Seomoz Backlink Analysis

Discover relevant backlinks.
"""

import os
import csv
import time
import argparse

import numpy as np
import pandas as pd

from lsapi import lsapi

api = lsapi('member-76bd0a8077', '09e78de0f24fbbf8b41b46623b75d5e6')

parser = argparse.ArgumentParser(description='Seomoz Analyzer')

parser.add_argument('urls', help='path to list of urls to analyze')
parser.add_argument('links', help='path to list of urls to output')
parser.add_argument('--column', default='resulturl', help='column name')

args = parser.parse_args()

urls = pd.read_csv(args.urls)
urls = set(urls[args.column])

if os.path.exists(args.links):
    df = pd.read_csv(args.links)
    for url in set(df['url']):
Example No. 11
from lsapi import lsapi

l = lsapi("member-a1c2050723", "9776ad0162ea4c492b2b4d56a0cfcd1a")
print "Calling API"
mozMetrics = l.urlMetrics("http://www.google.com")
# print mozMetrics
# links = l.links('http://www.google.com')

links = l.links("www.soic.indiana.edu/computer-science/")
# print "\n\n", links
for items in links:
    # print items
    for key, value in items.iteritems():
        if key == "uu":
            print key, value
print "Call to API Ended"
Example No. 12
#! /usr/bin/env python

from lsapi import lsapi

l = lsapi('my-access-id', 'my-secret-key')

# As you may have noticed, there are lots of columns available
# I did what I could to make them easily-accessible, but there
# are a lot, and the names are long. So, the API calls have
# defaults

# Let's get some URL metrics. Results come back as an array of dictionaries;
# the i'th dictionary holds the results for the i'th URL
metrics = l.urlMetrics(['www.seomoz.org', 'www.seomoz.org/blog'])
# Now let's say we only want specific columns in the results
authorities = l.urlMetrics(['www.seomoz.org'], lsapi.UMCols.domainAuthority | lsapi.UMCols.pageAuthority)
# Or if you just need results for one URL
mozMetrics = l.urlMetrics('www.seomoz.org')

# Now for some anchor text results
anchorResults = l.anchorText('www.seomoz.org/blog')
# Or for just specific columns
anchorTermResults = l.anchorText('www.seomoz.org/blog', cols=lsapi.ATCols.term)

# Now for some links results
links = l.links('www.seomoz.org')
# The links API has more columns to specify, as well as sort, scope, etc.
links = l.links('www.seomoz.org', scope='domain_to_domain', sort='domain_authority',
                filters=['external', 'nofollow'], targetCols=lsapi.UMCols.url)