예제 #1
0
파일: final.py 프로젝트: hoytnix/spidey
def get_contact_domains():
    """Return the top-level keys of the ``./json/01.json`` database as a list."""
    contacts = DB(f='./json/01.json').db
    return [domain for domain in contacts]
예제 #2
0
파일: sort.py 프로젝트: hoytnix/spidey
def main():
    """Print the entries of the ``words`` table, sorted ascending by value.

    Each entry is printed on its own line as a ``(key, value)`` tuple.
    """
    word_table = DB(f='03.json').db['words']

    by_value = operator.itemgetter(1)
    for pair in sorted(word_table.items(), key=by_value):
        # A parenthesised single argument prints the same text whether
        # ``print`` is a statement or a function.
        print(pair)
예제 #3
0
파일: 03.py 프로젝트: hoytnix/spidey
def main():
  """Merge the scraped keyword archive into the ``./json/03.json`` database.

  The text returned by ``get_archive()`` is joined into one string and split
  on a 100-dash rule; only the part after the first rule is processed.  That
  part is split on an 80-dash rule into per-site sections (the trailing piece
  after the last rule is discarded).  Within a section the first non-empty
  line, with ``'file: '`` removed, is the domain and every following
  non-empty line is a word.

  For each section with at least one word, the words are appended to
  ``data['domains'][domain]`` and the counter ``data['words'][word]`` is
  incremented once per occurrence.  Finally the database is updated and
  saved.

  Raises:
    IndexError: if the archive text contains no 100-dash rule.
  """
  # Init DB
  db = DB(f='./json/03.json')
  data = db.db
  # Single pre-formatted argument: identical output whether ``print`` is a
  # statement or a function.
  print('%s %s' % (db.f, len(db.db)))

  # Shortcuts :)
  section_rule = '-' * 100
  site_rule = '-' * 80

  # Get Archive!
  archive = ''.join(get_archive())

  # Everything before the first 100-dash rule is a header that is not used.
  body = archive.split(section_rule)[1]

  # The text after the final 80-dash rule is not a complete site section.
  sites = body.split(site_rule)[:-1]

  for site in sites:
    # Build a filtered copy.  Calling remove() on the list while iterating
    # over it skips elements, so adjacent blank lines used to survive and
    # could end up as the domain or be counted as a word.
    lines = [line for line in site.split('\n') if line != '']

    if len(lines) <= 1:
      continue

    domain = lines[0].replace('file: ', '')
    words = lines[1:]

    domains = data['domains']
    if domain in domains:
      domains[domain].extend(words)
    else:
      domains[domain] = list(words)

    counts = data['words']
    for word in words:
      if word in counts:
        counts[word] += 1
      else:
        counts[word] = 1

  # Update DB
  db.update(data)
  db.save()
예제 #4
0
파일: forms.py 프로젝트: hoytnix/spidey
    def doWork(self):
        """Worker loop: scan queued site indices for forms and batch the hits.

        Runs forever.  For each item taken from ``self.q``:

        1. The item is converted to an ``int`` index and the text for that
           index is fetched with ``self.get_txt``.
        2. ``self.search_forms`` is run on the text; a truthy result is stored
           in ``self.json`` under ``self.sites[index]``.  Errors raised while
           searching or storing are ignored so one bad site does not stop the
           worker.
        3. Whenever ``len(self.json)`` is a multiple of 50 a timestamp is
           printed.
        4. When ``len(self.json)`` equals ``self.lines - 150000 - 2`` the
           batch is written through ``self.db``, ``self.dbs`` is incremented,
           a new ``forms-<dbs>.json`` database is opened and the batch is
           emptied.
        5. The queue item is marked done.
        """
        while True:
            self.domain = int(self.q.get())
            self.txt = self.get_txt(self.domain)

            try:
                self.s = self.search_forms(txt=self.txt)
                if self.s:
                    self.json[self.sites[self.domain]] = self.s
            except Exception:
                # Best effort: a site that cannot be searched is skipped.
                # Catching Exception (not a bare except) keeps SystemExit and
                # KeyboardInterrupt from being silently swallowed here.
                pass

            if len(self.json) % 50 == 0:
                # NOTE(review): assuming this is time.strftime, '%H:%S' is
                # hour:second -- '%H:%M:%S' may have been intended; confirm.
                print(strftime('%H:%S'))

            # NOTE(review): the flush threshold is a hard-coded offset from
            # the number of sites; confirm the meaning of 150000 and 2.
            if len(self.json) == self.lines - 150000 - 2:
                self.db.update(self.json)
                self.db.save()
                self.dbs += 1
                self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
                self.json = {}

            self.q.task_done()
예제 #5
0
파일: forms.py 프로젝트: hoytnix/spidey
    def __init__(self):
        """Set up the forms database and work queue, then process the sites.

        Construction does all the work: it opens the current
        ``forms-<dbs>.json`` database, starts ``self.concurrent`` daemon
        threads running ``self.doWork``, queues every site index from
        ``self.dbs * 2000`` up to (not including) ``self.lines``, and blocks
        until the queue has been fully processed.  A ``KeyboardInterrupt``
        while queueing or waiting exits the process with status 1.
        """
        
        # Initialize Database
        # dbs is the numeric suffix of the current forms-%d.json file; it also
        # scales the first index queued below.
        self.dbs = 70
        self.new_db()  # defined outside this excerpt -- TODO confirm what it prepares
        self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
        self.json = {}  # in-memory batch, starts empty

        # Stuff
        self.txt = ''
        self.sites = self.get_files()
        self.lines = len(self.sites)  # number of entries returned by get_files()
        self.concurrent = 1  # number of worker threads started below
        # Bounded queue: put() blocks once 2 * concurrent items are waiting.
        self.q = Queue(self.concurrent * 2)
        # Note: the loop variable and the thread handle are stored on the
        # instance (self.i / self.t), not in locals.
        for self.i in range(self.concurrent):
            self.t = Thread(target=self.doWork)
            self.t.daemon = True  # daemon threads do not keep the process alive
            self.t.start()
        try:
            # Queue the site indices, starting at dbs * 2000.
            # NOTE(review): presumably a resume offset -- confirm the 2000.
            for self.i in xrange(self.dbs * 2000, self.lines):
                self.q.put(self.i)
            # Wait until every queued index has been marked done.
            self.q.join()
        except KeyboardInterrupt:
            sys.exit(1)
예제 #6
0
파일: final.py 프로젝트: hoytnix/spidey
def get_word_domains():
    """Return the ``'domains'`` entry of the ``./json/03.json`` database."""
    return DB(f='./json/03.json').db['domains']
예제 #7
0
파일: 01.py 프로젝트: hoytnix/spidey
def _facebook_link(row):
    """Return the Facebook link carried by an ``'f'`` row, or ``None``."""
    if not row.startswith('f'):
        return None
    # Assumes a fixed 13-character label precedes the value -- TODO confirm.
    link = row[13:]
    if 'fbml' in link or '.php' in link or 'oauth' in link:
        return None
    if 'facebook.com' not in link:
        return None
    # A bare site root is not a usable page link.
    if link in ('facebook.com', 'facebook.com/'):
        return None
    return link


def _email_address(row):
    """Return the e-mail address carried by an ``'m'`` row, or ``None``."""
    if not row.startswith('m'):
        return None
    # Assumes a fixed 8-character label precedes the value -- TODO confirm.
    address = row[8:]
    if '@' in address:
        return address
    return None


def main():
    """Merge scraped contact details into the ``./json/01.json`` database.

    The text returned by ``get_archive()`` is joined into one string and
    split on a 100-dash rule; only the part after the first rule is
    processed.  That part is split on an 80-dash rule into per-site sections
    (the trailing piece after the last rule is discarded).  Within a section
    the first non-empty line, with ``'file: '`` removed, is the site URL; the
    second non-empty line is skipped and the remaining lines are scanned for
    a Facebook link and an e-mail address.

    A site with at least one hit is stored as ``data[url]``, a dict with
    optional ``'facebook'`` and ``'email'`` keys, replacing any previous
    entry for that URL.  Finally the database is updated and saved.

    Raises:
        IndexError: if the archive text contains no 100-dash rule.
    """
    # Init DB
    db = DB(f='./json/01.json')
    data = db.db
    # Single pre-formatted argument: identical output whether ``print`` is a
    # statement or a function.
    print('%s %s' % (db.f, len(db.db)))

    # Shortcuts :)
    section_rule = '-' * 100
    site_rule = '-' * 80

    # Get Archive!
    archive = ''.join(get_archive())

    # Everything before the first 100-dash rule is a header that is not used.
    body = archive.split(section_rule)[1]

    # The text after the final 80-dash rule is not a complete site section.
    sites = body.split(site_rule)[:-1]

    for site in sites:
        # Build a filtered copy.  Calling remove() on the list while
        # iterating over it skips elements, so adjacent blank lines used to
        # survive and shift the URL / row positions.
        lines = [line for line in site.split('\n') if line != '']

        if len(lines) <= 2:
            continue

        url = lines[0].replace('file: ', '')

        # Collect all hits for the site first.  Resetting the stored entry on
        # every matching row dropped the Facebook link whenever an e-mail row
        # followed it (and vice versa).
        contact = {}
        for row in lines[2:]:
            link = _facebook_link(row)
            if link:
                contact['facebook'] = link
            address = _email_address(row)
            if address:
                contact['email'] = address

        if contact:
            data[url] = contact

    # Update DB
    db.update(data)
    db.save()