def get_site(url):
    """Fetch *url*, cut out the form around its password field and store it.

    The page is downloaded, the ``<form ...> ... /form>`` span that surrounds
    the first recognised password-input marker is extracted, a relative
    ``action`` target is prefixed with *url*, a few upper-case HTML keywords
    are lower-cased, and the result is saved as a ``Website`` record.

    Returns the saved ``Website`` object, or ``None`` when the download
    raises ``IOError``.

    NOTE(review): another ``get_site`` definition appears later in this
    source; if both live in the same module the later one rebinds the name.
    """
    try:
        fp = urlopen(url)
        try:
            s = fp.read()
        finally:
            # Release the connection even when read() fails.
            fp.close()
    except IOError:
        return None

    # Spellings of the password attribute, tried in order.  The original code
    # assigned the later fallbacks to a misspelled variable, so they were
    # never used; here every spelling is honoured.  When none is present the
    # choice is irrelevant: partition() then returns (s, '', '') regardless.
    markers = (
        'type="password"',
        'type=password',
        'type=PASSWORD',
        'type = password',
    )
    psplit = next((m for m in markers if m in s), markers[-1])
    before, marker, after = s.partition(psplit)

    # Opening tag is searched before the password field, closing tag after it.
    form_starts = ("<form", "<FORM", "<Form")
    frmstrt = next((t for t in form_starts if t in before), form_starts[-1])
    frmend = "/form>" if "/form>" in after else "/FORM>"

    # Keep the text from the last form start before the password field up to
    # the first form end, re-attaching the delimiters partition() removed.
    body = before.rpartition(frmstrt)[2] + marker + after
    f = frmstrt + body.partition(frmend)[0] + frmend

    # Make a relative action target absolute by prefixing the page URL.
    action_forms = ('action="', 'action ="', 'action= "', 'action = "')
    actn = next((a for a in action_forms if a in f), action_forms[-1])
    head, sep, tail = f.partition(actn)
    target, quote, rest = tail.partition('"')
    # Only rewrite when an action attribute was actually found; otherwise the
    # URL would just be appended after the closing form tag.
    if sep and "http" not in target:
        tmp = target if target.startswith('/') else '/' + target
        f = head + sep + url + tmp + quote + rest

    # Normalise the upper-case keywords later processing expects in lower case.
    for keyword in ("INPUT", "TEXT", "PASSWORD", "VALUE"):
        f = f.replace(keyword, keyword.lower())

    site = Website(url=url, subform=convert2asci(f))
    site.save()
    return site
def get_site(url):
    """Download ``http://<url>``, normalise its login form and store it.

    The page is fetched (up to four attempts), ``get_lgnform`` extracts the
    login form markup, the markup is normalised with case-insensitive regular
    expressions, a relative ``action`` target is prefixed with the final
    (post-redirect) URL, the submit input is tagged with ``id="uvl_sub"``,
    and the result is saved as a ``Website`` record.  Afterwards
    ``saveIcon`` is called on a best-effort basis.

    Returns the saved ``Website`` object, or ``None`` when the page could not
    be fetched or no login form was found; callers must handle ``None``.
    """
    geturl = "http://" + url

    # The network cannot be trusted: try four times before reporting failure.
    fp = None
    for _ in range(4):
        try:
            fp = urlopen(geturl)
            break
        except IOError:
            print("IoError")
            fp = None
    if fp is None:
        return None  # handled by the calling function

    try:
        data = fp.read()
        hdr = fp.headers['content-type']  # passed on to saveIcon below
        endurl = fp.geturl()
    finally:
        # Close the response even if reading or the header lookup fails.
        fp.close()

    f = get_lgnform(data)
    if f is None:
        # No login form was found.  The original fell through and crashed
        # with TypeError in re.sub(); report failure like a failed fetch.
        # TODO: call LFD here.
        return None

    # Normalise tag/attribute spelling (case-insensitively).  Order matters.
    # Known limitation: an id attribute or script containing 'input' or
    # 'value' is rewritten too.
    rewrites = (
        (r"input", 'input'),
        (r"value", 'value'),
        (r"""type( )*=( )*('|")?text('|")?""", 'type="text"'),
        (r"""type( )*=( )*('|")?password('|")?""", 'type="password"'),
        (r"""type( )*=( )*('|")?submit('|")?""", 'type="submit"'),
        (r"""action( )*=( )*("|')""", 'action="'),
    )
    for pattern, replacement in rewrites:
        f = re.compile(pattern, re.I).sub(replacement, f)

    # Make a relative action target absolute.
    # NOTE(review): only the opening quote of the action value is normalised
    # above, so a single-quoted value is still split at the next '"'.
    # TODO: use a regexp here as well.
    head, sep, tail = f.partition('action="')
    target, quote, rest = tail.partition('"')
    # Only rewrite when an action attribute exists, and keep the remainder of
    # the form: the original dropped `quote + rest`, truncating the markup
    # right after the action URL.
    if sep and "http" not in target:
        tmp = target if target.startswith('/') else '/' + target
        f = head + sep + endurl + tmp + quote + rest

    # Tag the submit input so it can be addressed by id later on; skip this
    # when there is no submit input instead of appending a stray attribute.
    before, marker, after = f.partition('type="submit"')
    if marker:
        f = before + marker + ' id="uvl_sub" ' + after

    site = Website(url=url, subform=convert2asci(f))
    site.save()

    # Icon handling is best-effort: a failure must not lose the saved site.
    try:
        saveIcon(geturl, data, hdr, site.id)
    except Exception:
        pass
    return site