def dumps_to_db():
    """Fold every dumped URL into the JSON database, keyed by domain.

    Each line yielded by ``get_dumps()`` is parsed as ``http://<line>``.
    The database key is ``<domain>.<tld>``; its entry accumulates the
    distinct URL paths and subdomain labels seen for that domain.  Lines
    whose parsing raises ``ValueError`` are skipped.  The database size is
    printed before and after the merge, and the result is saved.
    """
    store = Jsondb()
    records = store.db
    print(len(records))

    for raw in get_dumps():
        target = 'http://' + raw
        try:
            parts = get_domain_parts(target)
            path = urlparse(target).path
            key = '%s.%s' % (parts.domain, parts.tld)

            # First sighting of this domain: start an empty record.
            if key not in records:
                records[key] = {'subdomains': [], 'paths': []}

            known_paths = records[key]['paths']
            if path not in known_paths:
                known_paths.append(path)

            if parts.subdomains:
                known_subs = records[key]['subdomains']
                for label in parts.subdomains:
                    if label not in known_subs:
                        known_subs.append(label)
        except ValueError:
            # Unparseable line: ignore it and carry on with the next one.
            continue

    store.update(records)
    print(len(store.db))
    store.save()
def main():
    """Load the per-site word archive into ``./json/03.json``.

    The text from ``get_archive()`` is split on a line of 100 dashes; the
    part after it is split into site sections on lines of 80 dashes (the
    trailing piece after the last separator is discarded).  Within a
    section the first non-empty line names the site (``file: <domain>``)
    and every following non-empty line is a word.

    For each site the words are appended to ``data['domains'][domain]`` and
    each occurrence increments ``data['words'][word]``.  The database is
    then updated and saved.
    """
    # Init DB
    db = DB(f='./json/03.json')
    data = db.db
    print('%s %s' % (db.f, len(db.db)))

    header_sep = '-' * 100
    site_sep = '-' * 80

    # Everything before the 100-dash line is a header that is not used here.
    body = ''.join(get_archive()).split(header_sep)[1]

    for section in body.split(site_sep)[:-1]:
        # Build a filtered list instead of removing items from the list
        # being iterated: in-place removal skips the element after each
        # removed one, so runs of blank lines were only partially dropped.
        lines = [line for line in section.split('\n') if line]
        if len(lines) < 2:
            continue

        domain = lines[0].replace('file: ', '')
        words = lines[1:]

        if domain in data['domains']:
            data['domains'][domain].extend(words)
        else:
            data['domains'][domain] = list(words)

        counts = data['words']
        for word in words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1

    # Update DB
    db.update(data)
    db.save()
def update_db():
    """Add a ``'tld'`` field to every entry of the 15-05-26 database.

    Each key is parsed as ``http://<key>`` with ``get_domain_parts``; the
    resulting TLD is stored on the entry.  Keys that raise ``ValueError``
    are printed and left untouched.  The database is then updated and saved.
    """
    store = Jsondb(f='/home/johnny/dev/backzupz/spider/a/_dumps/db_15-05-26.json')
    records = store.db

    for host in records:
        try:
            records[host]['tld'] = get_domain_parts('http://' + host).tld
        except ValueError:
            # Report the key that could not be parsed.
            print(host)

    store.update(records)
    store.save()
def dumps_to_db():
    """Merge the dumped URLs into the JSON database.

    Every item from ``get_dumps()`` is treated as a URL without scheme.
    Entries are keyed by ``<domain>.<tld>`` and hold two de-duplicated
    lists: the URL paths and the subdomain labels seen for that key.
    Items that raise ``ValueError`` during parsing are ignored.  Prints the
    database size before and after, then saves.
    """
    jsondb = Jsondb()
    table = jsondb.db
    print(len(table))

    dumped = get_dumps()
    for item in dumped:
        try:
            pieces = get_domain_parts('http://' + item)
            parsed = urlparse('http://' + item)
            route = parsed.path
            site = '%s.%s' % (pieces.domain, pieces.tld)

            if site not in table:
                table[site] = {'subdomains': [], 'paths': []}

            if route not in table[site]['paths']:
                table[site]['paths'].append(route)

            if not pieces.subdomains:
                continue
            for sub in pieces.subdomains:
                if sub not in table[site]['subdomains']:
                    table[site]['subdomains'].append(sub)
        except ValueError:
            pass

    jsondb.update(table)
    print(len(jsondb.db))
    jsondb.save()
def domain_stats():
    """Count the entries of the 15-05-26 database per top-level domain.

    Returns:
        A dict mapping each ``'tld'`` value found on the entries to the
        number of entries carrying it.
    """
    store = Jsondb(f='/home/johnny/dev/backzupz/spider/a/_dumps/db_15-05-26.json')
    records = store.db

    totals = {}
    for host in records:
        tld = records[host]['tld']
        totals[tld] = totals.get(tld, 0) + 1
    return totals
def doWork(self):
    """Worker loop: take site indexes from ``self.q`` and collect forms.

    For every index the dump is read with ``get_txt`` and scanned with
    ``search_forms``; a non-empty result is stored in ``self.json`` under
    the site's file name.  Any failure while reading or scanning a site is
    skipped (best effort) so one bad dump cannot kill the worker and leave
    the producer blocked on the queue.  Runs forever; intended to be the
    target of a daemon thread.
    """
    while True:
        self.domain = int(self.q.get())
        try:
            # Reading is inside the guard too: a missing or corrupt dump
            # used to terminate the thread before task_done() was called.
            self.txt = self.get_txt(self.domain)
            self.s = self.search_forms(txt=self.txt)
            if self.s:
                self.json[self.sites[self.domain]] = self.s
        except Exception:
            # Deliberate best effort, but no longer swallows
            # KeyboardInterrupt / SystemExit like a bare ``except:``.
            pass

        if len(self.json) % 50 == 0:
            # NOTE(review): '%H:%S' prints hour and *second*; '%H:%M' may
            # have been intended - confirm before changing the output.
            print(strftime('%H:%S'))

        # NOTE(review): flushes only when exactly ``lines - 150002`` results
        # are pending; looks like a threshold tuned for one data set.
        if len(self.json) == self.lines - 150000 - 2:
            self.db.update(self.json)
            self.db.save()
            self.dbs += 1
            # Continue in the next numbered output file.
            self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
            self.json = {}

        self.q.task_done()
def __init__(self):
    """Open the output database, start the workers and feed them indexes.

    Creates ``./json/forms/forms-<dbs>.json`` as an empty JSON object,
    loads the site list with ``get_files``, starts ``self.concurrent``
    daemon threads running ``doWork`` and then enqueues every site index
    from ``self.dbs * 2000`` up to the number of sites, blocking until the
    queue has been fully processed.  Exits with status 1 on Ctrl-C.
    """
    # Initialize Database
    self.dbs = 70
    self.new_db()
    self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
    self.json = {}

    # Stuff
    self.txt = ''
    self.sites = self.get_files()
    self.lines = len(self.sites)
    self.concurrent = 1
    self.q = Queue(self.concurrent * 2)

    for _ in range(self.concurrent):
        worker = Thread(target=self.doWork)
        worker.daemon = True
        worker.start()
        # Kept for compatibility: references the most recently started worker.
        self.t = worker

    try:
        # The loop counter is a local on purpose.  It used to be stored in
        # ``self.i``, which ``search_forms`` also writes from the worker
        # thread, so the value could change between assignment and put().
        for index in xrange(self.dbs * 2000, self.lines):
            self.q.put(index)
        self.q.join()
    except KeyboardInterrupt:
        sys.exit(1)
def main():
    """Extract facebook links and e-mail addresses into ``./json/01.json``.

    The text from ``get_archive()`` is split on a line of 100 dashes; the
    part after it is split into site sections on lines of 80 dashes (the
    trailing piece after the last separator is discarded).  In each section
    the first non-empty line names the site (``file: <url>``), the second
    is skipped, and the remaining rows are inspected:

    * rows starting with ``f`` contribute ``row[13:]`` as the facebook link
      when it contains ``facebook.com``, is not the bare domain, and does
      not contain ``fbml``, ``.php`` or ``oauth``;
    * rows starting with ``m`` contribute ``row[8:]`` as the e-mail when it
      contains ``@``.

    A site with at least one hit gets a fresh entry holding the
    ``'facebook'`` and/or ``'email'`` values (the last matching row of each
    kind wins).  The database is then updated and saved.
    """
    # Init DB
    db = DB(f='./json/01.json')
    data = db.db
    print('%s %s' % (db.f, len(db.db)))

    header_sep = '-' * 100
    site_sep = '-' * 80

    # Everything before the 100-dash line is a header that is not used here.
    body = ''.join(get_archive()).split(header_sep)[1]

    for section in body.split(site_sep)[:-1]:
        # Filter into a new list: removing items from the list being
        # iterated skipped elements and left blank lines behind, which
        # shifted the fixed row positions used below.
        rows = [line for line in section.split('\n') if line]
        if len(rows) < 3:
            continue

        url = rows[0].replace('file: ', '')

        # Collect all hits of one site before storing them.  Previously the
        # entry was reset to {} on every matching row, so a site with both
        # a facebook row and a mail row kept only the last one.
        entry = {}
        for row in rows[2:]:
            if row[0] == 'f':
                # presumably strips a fixed-width row label - TODO confirm
                link = row[13:]
                unwanted = ('fbml' in link or '.php' in link
                            or 'oauth' in link)
                bare = link in ('facebook.com', 'facebook.com/')
                if 'facebook.com' in link and not unwanted and not bare:
                    entry['facebook'] = link
            elif row[0] == 'm':
                # presumably strips a fixed-width row label - TODO confirm
                address = row[8:]
                if '@' in address:
                    entry['email'] = address

        if entry:
            data[url] = entry

    # Update DB
    db.update(data)
    db.save()
class Work:
    """Scan compressed page dumps for ``<form>`` elements and store them.

    Constructing an instance runs the whole job: it loads the list of dump
    files, starts daemon worker threads running ``doWork`` and feeds them
    site indexes through a bounded queue.  Results are collected in
    ``self.json`` and written through ``self.db``.
    """

    def __init__(self):
        """Open the output database, start the workers and feed them.

        Enqueues every site index from ``self.dbs * 2000`` up to the number
        of sites and blocks until the queue has been processed.  Exits with
        status 1 on Ctrl-C.
        """
        # Initialize Database
        self.dbs = 70
        self.new_db()
        self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
        self.json = {}

        # Stuff
        self.txt = ''
        self.sites = self.get_files()
        self.lines = len(self.sites)
        self.concurrent = 1
        self.q = Queue(self.concurrent * 2)

        for _ in range(self.concurrent):
            worker = Thread(target=self.doWork)
            worker.daemon = True
            worker.start()
            # Kept for compatibility: the most recently started worker.
            self.t = worker

        try:
            # Local counter on purpose: as an instance attribute it could be
            # modified by the worker thread between assignment and put().
            for index in xrange(self.dbs * 2000, self.lines):
                self.q.put(index)
            self.q.join()
        except KeyboardInterrupt:
            sys.exit(1)

    def new_db(self):
        """Create or truncate the current output file as an empty JSON object."""
        with open('./json/forms/forms-%d.json' % self.dbs, 'w+') as f:
            f.write('{}')

    def doWork(self):
        """Worker loop: take site indexes from ``self.q`` and collect forms.

        A non-empty ``search_forms`` result is stored in ``self.json`` under
        the site's file name.  Any failure while reading or scanning a site
        is skipped (best effort) so one bad dump cannot kill the worker and
        leave the producer blocked on the queue.  Runs forever.
        """
        while True:
            self.domain = int(self.q.get())
            try:
                # Reading is guarded too: a missing or corrupt dump used to
                # terminate the thread before task_done() was called.
                self.txt = self.get_txt(self.domain)
                self.s = self.search_forms(txt=self.txt)
                if self.s:
                    self.json[self.sites[self.domain]] = self.s
            except Exception:
                # Deliberate best effort, but no longer swallows
                # KeyboardInterrupt / SystemExit like a bare ``except:``.
                pass

            if len(self.json) % 50 == 0:
                # NOTE(review): '%H:%S' prints hour and *second*; '%H:%M'
                # may have been intended - confirm before changing.
                print(strftime('%H:%S'))

            # NOTE(review): flushes only when exactly ``lines - 150002``
            # results are pending; looks tuned for one data set.
            if len(self.json) == self.lines - 150000 - 2:
                self.db.update(self.json)
                self.db.save()
                self.dbs += 1
                # Continue in the next numbered output file.
                self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
                self.json = {}

            self.q.task_done()

    def control(self, c):
        """Return the non-empty identifying attributes of an element.

        Args:
            c: any object with a dict-like ``get(name)`` method.

        Returns:
            A dict holding those of ``name``, ``id``, ``action``,
            ``method``, ``class``, ``type`` and ``disabled`` whose value is
            truthy.
        """
        wanted = ('name', 'id', 'action', 'method', 'class', 'type',
                  'disabled')
        control = {}
        for attribute in wanted:
            value = c.get(attribute)
            if value:
                control[attribute] = value
        return control

    def search_forms(self, txt):
        """Describe every ``<form>`` found in the markup ``txt``.

        Returns:
            A dict keyed by the form's position as a string (``'0'``,
            ``'1'``, ...).  Each value holds the form's own attributes (see
            ``control``) plus ``'inputs'``, a list with one attribute dict
            per ``<input>`` inside the form.  Empty when there are no forms.
        """
        # Soup Object
        soup = bs4(txt)

        # Scratch state is kept in locals rather than on ``self`` so the
        # worker thread cannot clash with other users of the instance.
        site = {}
        for index, form in enumerate(soup.findAll('form')):
            entry = self.control(form)
            entry['inputs'] = [self.control(field)
                               for field in form.findAll('input')]
            site[str(index)] = entry
        return site

    def get_txt(self, domain):
        """Return the decompressed content of the dump at index ``domain``."""
        path = '%s/%s' % (DUMPS_DIR, self.sites[domain])
        # zlib data is binary: read it in binary mode so no newline
        # translation or decoding can corrupt it.
        with open(path, 'rb') as f:
            compressed = f.read()
        # Decompress
        return zlib.decompress(compressed)

    def get_files(self):
        """Return the dump file names listed in ``../dumps/04-forms``.

        The part of the file after the 100-dash line is split on runs of
        five dashes (the trailing piece is discarded); for each non-empty
        chunk the last line, minus the ``file: `` label, is the file name.
        """
        with open('../dumps/04-forms', 'r') as f:
            txt = f.read()

        # Sanitize.
        body = txt.split('-' * 100)[1]
        chunks = [chunk.strip() for chunk in body.split('-' * 5)[:-1]]
        # Filter into a new list: removing items from the list being
        # iterated skipped elements and left empty chunks behind.
        chunks = [chunk for chunk in chunks if chunk]

        # Search.
        return [chunk.split('\n')[-1].replace('file: ', '')
                for chunk in chunks]
def main():
    """Collect facebook links and e-mail addresses into ``./json/01.json``.

    The archive text from ``get_archive()`` is cut at a line of 100 dashes;
    what follows is cut into site sections at lines of 80 dashes, dropping
    the trailing piece.  Per section, the first non-empty line gives the
    site (``file: <url>``), the second is skipped and the rest are rows:

    * a row starting with ``f`` yields ``row[13:]`` as facebook link if it
      contains ``facebook.com``, is not just the bare domain and contains
      none of ``fbml``, ``.php``, ``oauth``;
    * a row starting with ``m`` yields ``row[8:]`` as e-mail if it contains
      ``@``.

    Sites with at least one hit are stored as a new entry with the
    ``'facebook'`` and/or ``'email'`` keys; the last matching row of each
    kind wins.  Finally the database is updated and saved.
    """
    # Init DB
    db = DB(f='./json/01.json')
    data = db.db
    print('%s %s' % (db.f, len(db.db)))

    # The header before the 100-dash line is not used here.
    sections = ''.join(get_archive()).split('-' * 100)[1].split('-' * 80)

    for section in sections[:-1]:
        # A comprehension replaces the old remove-while-iterating loop,
        # which skipped elements and could leave blank lines in place.
        rows = [line for line in section.split('\n') if line]
        if len(rows) <= 2:
            continue

        url = rows[0].replace('file: ', '')
        facebook = None
        email = None

        for row in rows[2:]:
            kind = row[0]
            if kind == 'f':
                # presumably strips a fixed-width row label - TODO confirm
                candidate = row[13:]
                if 'facebook.com' not in candidate:
                    continue
                if candidate in ('facebook.com', 'facebook.com/'):
                    continue
                if ('fbml' in candidate or '.php' in candidate
                        or 'oauth' in candidate):
                    continue
                facebook = candidate
            elif kind == 'm':
                # presumably strips a fixed-width row label - TODO confirm
                candidate = row[8:]
                if '@' in candidate:
                    email = candidate

        # Store once per site.  Resetting the entry on every matching row
        # (the old behaviour) discarded a facebook link as soon as a mail
        # row followed it, and vice versa.
        if facebook or email:
            data[url] = {}
            if facebook:
                data[url]['facebook'] = facebook
            if email:
                data[url]['email'] = email

    # Update DB
    db.update(data)
    db.save()