Пример #1
0
def get_site(url):
    """Fetch *url*, cut out the form around its password field and store it.

    Steps:
      1. Download the page with ``urlopen``.
      2. Locate the first password input (any letter case, optional quotes
         and whitespace around ``=``) and take the markup from the last
         ``<form`` before it up to the next ``/form>``.  When the page has
         no password input, the last form on the page is used instead.
      3. Resolve the form's ``action`` target against *url*.
      4. Lower-case the literal tokens INPUT, TEXT, PASSWORD and VALUE.
      5. Save the result as ``Website(url=url, subform=...)``.

    Returns the saved ``Website`` object, or ``None`` when the page cannot
    be fetched (``IOError``) or contains no ``<form`` tag to extract.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3

    try:
        response = urlopen(url)
        try:
            page = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except IOError:
        return None

    # One case-insensitive pattern replaces the chain of hand-written
    # spellings (type="password", type=password, type = password, ...).
    field = re.search(r"""type\s*=\s*["']?password""", page, re.I)
    # Without a password input the whole page is searched, so the last
    # form on the page is the one that gets picked.
    cut = field.start() if field else len(page)

    # The form that contains the field opens at the last "<form" before it.
    openings = [m.start() for m in re.finditer(r"<form", page[:cut], re.I)]
    if not openings:
        return None  # nothing form-like to store
    start = openings[-1]

    closing = re.search(r"/form>", page[start:], re.I)
    if closing:
        form = page[start:start + closing.end()]
    else:
        # Unterminated form: keep the rest of the page and close it.
        form = page[start:] + "</form>"

    # Make the first action target absolute.  urljoin leaves absolute URLs
    # untouched and resolves relative ones against the page URL.  The
    # look-behind skips attributes that merely end in "action"
    # (data-action, formaction).
    action = re.search(r"""(?<![\w-])action\s*=\s*(["'])(.*?)\1""",
                       form, re.I)
    if action:
        form = (form[:action.start(2)]
                + urljoin(url, action.group(2))
                + form[action.end(2):])

    # Lower-case the upper-case tokens wherever they occur in the markup.
    for token in ("INPUT", "TEXT", "PASSWORD", "VALUE"):
        form = form.replace(token, token.lower())

    site = Website(url=url, subform=convert2asci(form))
    site.save()
    return site
Пример #2
0
def get_site(url):
    """Fetch ``http://<url>``, normalise its login form and store it.

    The page is requested up to four times.  The form returned by
    ``get_lgnform`` is normalised (``input``/``value`` lower-cased,
    ``type=`` attributes for text/password/submit and the ``action``
    attribute rewritten to a canonical double-quoted spelling), its action
    target is made absolute relative to the URL reported by the response,
    and the first submit input receives ``id="uvl_sub"``.  The result is
    saved as ``Website(url=url, subform=...)``; saving the site icon
    afterwards is best-effort.

    Returns the saved ``Website`` object, or ``None`` when every fetch
    attempt fails or ``get_lgnform`` returns ``None``.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin  # Python 3

    geturl = "http://" + url

    # The network cannot be trusted: try four times before reporting failure.
    fp = None
    for _ in range(4):
        try:
            fp = urlopen(geturl)
            break
        except IOError:
            print("IoError")
    if fp is None:
        return None  # the caller is expected to handle this

    try:
        data = fp.read()
        hdr = fp.headers['content-type']
        endurl = fp.geturl()  # URL as reported by the response object
    finally:
        fp.close()

    form = get_lgnform(data)
    if form is None:
        # TODO: call LFD here.  Until then there is nothing to store, and
        # continuing would only fail inside re.sub with a TypeError.
        return None

    # Normalise spelling with case-insensitive patterns.  Known limitation:
    # ids or scripts that contain "input"/"value" are rewritten as well.
    form = re.compile(r"input", re.I).sub('input', form)
    form = re.compile(r"value", re.I).sub('value', form)
    for kind in ("text", "password", "submit"):
        pattern = re.compile(r"""type( )*=( )*('|")?%s('|")?""" % kind, re.I)
        form = pattern.sub('type="%s"' % kind, form)
    # Rewrite opening AND closing quote together so that a single-quoted
    # value does not end up as action="...'.
    form = re.compile(r"""action\s*=\s*(["'])(.*?)\1""", re.I).sub(
        r'action="\2"', form)

    # Make the first action target absolute.  Only the attribute value is
    # replaced, so the rest of the form is preserved.  urljoin leaves
    # absolute URLs untouched; the look-behind skips data-action/formaction.
    action = re.search(r'(?<![\w-])action="([^"]*)"', form)
    if action is not None:
        form = (form[:action.start(1)]
                + urljoin(endurl, action.group(1))
                + form[action.end(1):])

    # Tag the first submit input; a form without one is left unchanged.
    submit = 'type="submit"'
    form = form.replace(submit, submit + ' id="uvl_sub" ', 1)

    site = Website(url=url, subform=convert2asci(form))
    site.save()
    try:
        saveIcon(geturl, data, hdr, site.id)
    except Exception:
        # The icon is optional; a failure here must not lose the saved site.
        pass
    return site