def get_contact_domains():
    """Return every key stored in the ./json/01.json database as a list.

    The database is opened through ``DB`` and its ``db`` attribute is
    iterated once; the yielded keys are returned in iteration order.
    """
    contacts = DB(f='./json/01.json').db
    return list(contacts)
def main():
    """Print the word counts stored in 03.json, lowest count first.

    Reads the ``'words'`` mapping from the database, orders its
    ``(word, count)`` pairs by count (ascending) and prints one pair per
    line.
    """
    counts = DB(f='03.json').db['words']
    by_count = operator.itemgetter(1)
    for pair in sorted(counts.items(), key=by_count):
        # A parenthesised single argument prints the same text under
        # both the Python 2 print statement and the print function.
        print(pair)
def main():
    """Merge the words archive into the ./json/03.json database.

    The archive returned by ``get_archive()`` is joined into one string and
    parsed as follows:

    * everything before the first run of 100 dashes is ignored (the
      original bound it to an unused ``keywords`` local);
    * the text after it is cut into site sections on runs of 80 dashes;
      the piece following the last separator is discarded;
    * within a section, blank lines are dropped, the first remaining line
      is the domain (with any ``'file: '`` text removed) and every
      following line is one word.

    For each section with at least one word, the words are appended to
    ``db['domains'][domain]`` and each word's counter in ``db['words']`` is
    incremented.  The database is then updated and saved.

    Raises:
        IndexError: if the archive contains no 100-dash separator.
    """
    # Init DB
    db = DB(f='./json/03.json')
    data = db.db
    # One pre-formatted argument prints "<file> <size>" identically under
    # the Python 2 print statement and the print function.
    print('%s %s' % (db.f, len(db.db)))

    # Shortcuts :)
    br_1 = '-' * 100  # separates the header from the site sections
    br_2 = '-' * 80   # terminates each site section

    # Get Archive!
    archive = ''.join(get_archive())

    # Process: element 0 of the split is the unused header.
    body = archive.split(br_1)[1]
    sites = body.split(br_2)[:-1]

    domains = data['domains']
    counts = data['words']
    for site in sites:
        # Build a filtered copy instead of calling remove() while
        # iterating: removing during iteration skips the element after each
        # removal, so consecutive blank lines used to survive and end up
        # stored as an empty domain or an empty word.
        lines = [line for line in site.split('\n') if line]
        if len(lines) < 2:
            continue

        domain = lines[0].replace('file: ', '')
        words = lines[1:]

        domains.setdefault(domain, []).extend(words)
        for word in words:
            counts[word] = counts.get(word, 0) + 1

    # Update DB
    db.update(data)
    db.save()
def doWork(self):
    """Worker loop: search queued sites for forms and batch the results.

    Runs forever, so it is meant to be used as a thread target.  For every
    item taken from ``self.q``:

    1. the item is converted to an integer index (``self.domain``);
    2. ``self.get_txt`` fetches the text for that index (``self.txt``);
    3. ``self.search_forms`` is run on the text and a truthy result is
       stored in ``self.json`` under ``self.sites[self.domain]``;
    4. the current time is printed whenever ``len(self.json)`` is a
       multiple of 50 (including 0);
    5. when ``len(self.json)`` reaches ``self.lines - 150000 - 2`` the batch
       is written through ``self.db``, ``self.dbs`` is incremented and a
       fresh ``DB`` and an empty batch are started;
    6. the queue item is marked as done.

    Errors raised while searching or storing a result are ignored so that
    one bad site does not stop the worker.  Errors from ``self.q.get`` or
    ``self.get_txt`` are not caught and end the loop.
    """
    while True:
        self.domain = int(self.q.get())
        self.txt = self.get_txt(self.domain)
        try:
            self.s = self.search_forms(txt=self.txt)
            if self.s:
                self.json[self.sites[self.domain]] = self.s
        except Exception:
            # Deliberately best effort: skip sites whose text cannot be
            # processed.  Narrowed from a bare ``except:`` so that
            # SystemExit / KeyboardInterrupt are never swallowed here.
            pass

        # Progress heartbeat.
        # NOTE(review): '%H:%S' prints hours and seconds; minutes may have
        # been intended -- confirm before changing the format.
        if len(self.json) % 50 == 0:
            print(strftime('%H:%S'))

        # NOTE(review): the meaning of the 150000 and 2 offsets is not
        # visible from this block -- confirm against the data set.
        if len(self.json) == self.lines - 150000 - 2:
            self.db.update(self.json)
            self.db.save()
            self.dbs += 1
            # self.new_db() was disabled in the original and stays disabled.
            self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
            self.json = {}

        self.q.task_done()
def __init__(self):
    """Set up the output database, start the workers and feed the queue.

    Construction does all the work: it opens
    ``./json/forms/forms-<dbs>.json`` (after calling ``self.new_db()``),
    loads the site list with ``self.get_files()``, starts
    ``self.concurrent`` daemon threads running ``self.doWork`` and then
    enqueues every index from ``self.dbs * 2000`` up to the number of
    sites, blocking until the queue has been fully processed.

    A ``KeyboardInterrupt`` raised while enqueueing or waiting exits the
    process with status 1.
    """
    # Initialize Database
    self.dbs = 70
    self.new_db()
    self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
    self.json = {}

    # Stuff
    self.txt = ''
    self.sites = self.get_files()
    self.lines = len(self.sites)

    # Worker pool; the queue holds at most two pending items per worker.
    self.concurrent = 1
    self.q = Queue(self.concurrent * 2)
    for worker_index in range(self.concurrent):
        self.i = worker_index
        worker = Thread(target=self.doWork)
        self.t = worker
        worker.daemon = True
        worker.start()

    try:
        first_index = self.dbs * 2000
        for site_index in xrange(first_index, self.lines):
            self.i = site_index
            self.q.put(site_index)
        self.q.join()
    except KeyboardInterrupt:
        sys.exit(1)
def get_word_domains():
    """Return the ``'domains'`` entry of the ./json/03.json database."""
    return DB(f='./json/03.json').db['domains']
def main():
    """Merge the contacts archive into the ./json/01.json database.

    The archive returned by ``get_archive()`` is joined into one string and
    parsed as follows:

    * everything before the first run of 100 dashes is ignored;
    * the text after it is cut into site sections on runs of 80 dashes;
      the piece following the last separator is discarded;
    * within a section, blank lines are dropped, the first remaining line
      is the URL (with any ``'file: '`` text removed), the second line is
      skipped and every later line is inspected:

      - a line starting with ``'f'`` contributes ``row[13:]`` as the
        ``'facebook'`` value, provided it contains ``'facebook.com'``, is
        not just ``'facebook.com'`` or ``'facebook.com/'``, and contains
        none of ``'fbml'``, ``'.php'`` or ``'oauth'``;
      - a line starting with ``'m'`` contributes ``row[8:]`` as the
        ``'email'`` value, provided it contains ``'@'``.

    A site that yields at least one value replaces its entry in the
    database; when several lines of one kind match, the last one wins.
    The database is then updated and saved.

    Raises:
        IndexError: if the archive contains no 100-dash separator.
    """
    # Init DB
    db = DB(f='./json/01.json')
    contacts = db.db
    # One pre-formatted argument prints "<file> <size>" identically under
    # the Python 2 print statement and the print function.
    print('%s %s' % (db.f, len(db.db)))

    # Shortcuts :)
    br_1 = '-' * 100  # separates the header from the site sections
    br_2 = '-' * 80   # terminates each site section

    # Get Archive!
    archive = ''.join(get_archive())

    # Process: element 0 of the split is the unused header.
    body = archive.split(br_1)[1]
    sites = body.split(br_2)[:-1]

    for site in sites:
        # Build a filtered copy instead of calling remove() while
        # iterating: removing during iteration skips the element after each
        # removal, so consecutive blank lines used to survive and shift the
        # positions of the URL and data rows.
        rows = [row for row in site.split('\n') if row]
        if len(rows) <= 2:
            continue

        url = rows[0].replace('file: ', '')

        # Collect everything found for this site first and store it once.
        # The original reset json[url] on every matching row, so a site
        # with both a facebook row and a mail row kept only the last one.
        found = {}
        for row in rows[2:]:
            if row[0] == 'f':
                # Presumably skips a fixed 13-character label -- confirm
                # against the code that writes the archive.
                link = row[13:]
                is_plain_link = ('fbml' not in link
                                 and '.php' not in link
                                 and 'oauth' not in link)
                if (is_plain_link
                        and 'facebook.com' in link
                        and link not in ('facebook.com', 'facebook.com/')):
                    found['facebook'] = link
            elif row[0] == 'm':
                # Presumably skips a fixed 8-character label -- confirm
                # against the code that writes the archive.
                address = row[8:]
                if '@' in address:
                    found['email'] = address

        if found:
            contacts[url] = found

    # Update DB
    db.update(contacts)
    db.save()