def getSkillPage( self, InputList, OutputQueue ):
    ProgressBar = tqdm( total=len( InputList ), desc='Getting Skill Pages', unit='Skill Page' )
    while True:
        RandomLinkIndex = randint( 0, len( InputList ) - 1 )    # randint is inclusive on both ends
        SkillPageLink = InputList[ RandomLinkIndex ]            # pick a random list item
        InputList.pop( RandomLinkIndex )                        # remove used link from list
        if SkillPageLink is None: break                         # None sentinel ends the loop
        SkillPage = self.Session.get(                           # get the page and pass to next method
            SkillPageLink
            , headers=generate_navigator()
        )
        if SkillPage.status_code == 999:
            print( 'YOU ARE BEING BLOCKED | ATTEMPTING RECONNECT...' )
            self.Session.close()                                # try opening up a new session
            sleep(uniform(self.MinSleepSecs + self.MaxSleepSecs, \
                          self.MaxSleepSecs * 2) )
            self.Session = requests.session()
            self._loginSession()
            SkillPage = self.Session.get(                       # retry the page with the new session
                SkillPageLink
                , headers=generate_navigator()
            )
            if SkillPage.status_code == 999:
                print( 'RECONNECTION FAILED : GIVING UP' )
                break
            else:
                print( 'RECONNECTION SUCCEEDED : CONTINUING' )
        OutputQueue.put( ( SkillPage, SkillPageLink ) )         # put results in the output queue
        ProgressBar.update( 1 )
        sleep(uniform(self.MinSleepSecs, self.MaxSleepSecs) )   # sleep for being a polite bot
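A hedged usage sketch for getSkillPage above: the owning class is not shown in the snippet, so LinkedInScraper is a placeholder name, and the None sentinel matches the `is None` check in the loop. Because links are popped at random, the sentinel can be drawn before the list is empty, a quirk the original shares.

from queue import Queue

scraper = LinkedInScraper()                      # hypothetical class name
links = scraper._getLinksList()                  # defined further down this page
results = Queue()

scraper.getSkillPage(links + [None], results)    # None sentinel ends the loop
while not results.empty():
    page, link = results.get()
    print(link, page.status_code)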
Example #2
def test_feature_oscpu():
    for _ in range(10):
        nav = generate_navigator(os='win')
        assert 'Windows NT' in nav['oscpu']
        nav = generate_navigator(os='linux')
        assert 'Linux' in nav['oscpu']
        nav = generate_navigator(os='mac')
        assert 'Mac OS' in nav['oscpu']
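For reference, a small sketch of the navigator fields the examples on this page exercise; the exact key set depends on the installed user_agent version.

from user_agent import generate_navigator

nav = generate_navigator(os='linux', navigator='firefox')
for key in ('user_agent', 'oscpu', 'platform', 'build_id', 'app_code_name'):
    print(key, '=', nav.get(key))    # other keys may be present as well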
Example #3
def test_oscpu_value(self):
    for x in range(100):
        nav = generate_navigator(platform='win')
        self.assertTrue('Windows NT' in nav['oscpu'])
        nav = generate_navigator(platform='linux')
        self.assertTrue('Linux' in nav['oscpu'])
        nav = generate_navigator(platform='mac')
        self.assertTrue('Mac OS' in nav['oscpu'])
Example #5
def get_page_proxy(url, proxy_ip=None, headers=None, t=0):
    if t == R_MAX_LOOP:         # R_MAX_LOOP: module-level retry cap, defined elsewhere
        return None
    if headers is None:         # avoid a shared mutable default dict between calls
        headers = {}

    t += 1
    logging.info("get page: %s" % url)

    try:
        headers['User-Agent'] = generate_navigator(navigator="chrome")['user_agent']
        try:
            if proxy_ip:
                logging.info("set proxy ip: %s" % proxy_ip)
                proxies = {
                    'http': 'http://%s' % proxy_ip,
                    'https': 'https://%s' % proxy_ip,
                }
                r = requests.get(url, proxies = proxies, timeout = 3, headers = headers)
            else:
                r = requests.get(url, timeout = 3, headers = headers)
            r.encoding = 'utf-8'
            if r.status_code == 200:
                return r.text
        except requests.Timeout as e:
            logging.error("Timeout ex %s" % e)
            return get_page_proxy(url = url, proxy_ip = proxy_ip, headers = headers, t = t)
        except requests.ConnectionError as e:
            logging.error("conn err %s" % e)
            return get_page_proxy(url = url, proxy_ip = proxy_ip, headers = headers, t = t)
        else:
            logging.error("status code: %s, ip: %s (req)" % (r.status_code, proxy_ip))
            return None
    except Exception as e:
        logging.error("error get page by req: %s" % e)
    return None
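A usage sketch for get_page_proxy above; the URL and proxy address are placeholders.

html = get_page_proxy('https://example.com/')
if html is None:
    logging.warning('all attempts failed or returned a non-200 status')

html_via_proxy = get_page_proxy('https://example.com/', proxy_ip='127.0.0.1:8080')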
Example #6
    def scrapeSkill( self, SkillPageLink, SkillPage=None ):
        '''
        Purpose:	Go to the skill page and scrape the list of companies and related skills, with
                    counts representing how many times each link has been seen in the wild on LinkedIn.
        Arguments:
            SkillPageLink - str - url to the skill page
            SkillPage - requests page obj - [ optional ] the skill webpage to process
        Returns:
            SkillsDict - dict - dictionary of scraped skills data.
                                ex:
                                {
                                    'Companies' : [ ( 'Company A', 200 ), ( 'Company B', 900 ),...  ]
                                    ,'RelatedSkills' : [ ( 'Skill A', 200 ), ( 'Skill B', 900 ),...  ]
                                }
        '''

        if not self.SessionLoggedIn: self._loginSession()
        SkillsDict = {'Companies': [], 'RelatedSkills': []}
        if SkillPage is None:
            SkillPage = self.Session.get(
                SkillPageLink
                , headers=generate_navigator()
            )
        if SkillPage.status_code == 200:

            SkillPageXML = html.fromstring( SkillPage.text )
            try:
                Skill = SkillPageXML.xpath( '//h1[@class="page-title"]/text()' )[ 0 ]							# extract skill name from page
                SkillsDict.update( { 'Skill' : Skill } )
            except IndexError:      # xpath returned no title element
                print( 'No Skill Title Found : ' + str( SkillPageLink ) )

            # GET COMPANY & RELATION COUNT COMBOS
            try:
                CompanySkillStrings = \
                    SkillPageXML.xpath('//div[@class="stats-text-container"]/h3[contains(text(),"companies")]/..')[ 0 ].xpath(
                        './*/li/text()')
                for CompanySkillString in CompanySkillStrings:
                    Company, RelationCount = [ x.strip() for x in CompanySkillString.rsplit( '-', 1 ) ]
                    SkillsDict[ 'Companies' ].append( ( Company, RelationCount ) )
            except (IndexError, ValueError):    # section missing or string not in 'Name - Count' form
                print( 'No Company Skill Data : ' + str( SkillPageLink ) )

            # GET RELATED SKILLS
            try:
                RelatedSkillStrings = \
                    SkillPageXML.xpath('//div[@class="stats-text-container"]/h3[contains(text(),"skills")]/..')[ 0 ].xpath(
                        './*/li/text()')
                for RelatedSkillString in RelatedSkillStrings:
                    RelatedSkillValue, RelationCount = [ x.strip() for x in RelatedSkillString.rsplit( '-', 1 ) ]
                    SkillsDict[ 'RelatedSkills' ].append( (RelatedSkillValue, RelationCount) )
            except (IndexError, ValueError):    # section missing or string not in 'Name - Count' form
                print( 'No Related Skill Data : ' + str( SkillPageLink ) )

        return SkillsDict
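A usage sketch for scrapeSkill above, reusing the scraper placeholder from the first sketch; the URL is hypothetical, real links come from _getLinksList further down this page.

skills = scraper.scrapeSkill('https://www.linkedin.com/directory/topics-a-1/')
print(skills.get('Skill'))
for company, count in skills['Companies']:
    print(company, count)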
Example #7
    def _loginSession(self, Username=None, Password=None):
        '''
        Purpose:	Login to this website using the provided credentials.
                    Uses requests, not webdriver. Login for requests only,
                    which cannot apply to jobs, but operates much faster than selenium.
        Arguments:
            Username - str - email address / username for site
            Password - str - password for your account with this site
        Returns:
            LoginSuccessful - bool - True if login was successful; False otherwise
        '''
        if self.SessionLoggedIn: return True
        if Username is not None: self.Username = Username
        if Password is not None: self.Password = Password
        if self.Password is None or self.Username is None:
            raise ValueError('ERROR : LOGIN CREDENTIALS REQUIRED')

        # SAVE THE SECURITY INFO
        LoginPage = self.Session.get(SITE['Login']['Base'])
        LoginPageSoup = BeautifulSoup(LoginPage.text, 'lxml')
        LoginPayload = dict( SITE['Login']['Payload'] )     # copy so the shared SITE config is not mutated
        for SecurityParam in SITE['Login']['Security']:
            SecurityValue = \
                LoginPageSoup.find('input', {'name': SecurityParam})['value']
            LoginPayload.update({SecurityParam: SecurityValue})

        # FILL OUT USERNAME AND PASSWORD
        LoginPayload.update({'session_key': self.Username
                                , 'session_password': self.Password})

        # SEND LOGIN REQUEST
        LoginHeaders = generate_navigator()
        LoginHeaders.update(                    # dict.update() returns None, so mutate in place
            {
                'X-IsAJAXForm': '1'
                , 'save-data': 'on'
                , 'referer': SITE[ 'Login' ][ 'Base' ]
            }
        )
        LoginResultPage = self.Session.post(
            SITE[ 'Login-Submit' ] ,data=LoginPayload
            ,headers=LoginHeaders
        )

        # CHECK IF LOGIN SUCCESSFUL
        if LoginResultPage.status_code != 200:
            LoginSuccessful = False
        else:
            LoginSuccessful = True
            self.PublicIdentifier = self._getPublicIdentifier( LoginResultPage.text )
        self.SessionLoggedIn = LoginSuccessful

        return LoginSuccessful
Example #8
def __init__(self):
    self.uuid = str(uuid.uuid4())  # type: ignore
    random_device_os = random.choice(DEVICE_OS_CHOICES)
    self.ip_address: str = generate_random_ip()
    generated_technical_data = generate_navigator()
    self.base_properties: dict = {
        'uuid': self.uuid,
        '$ip': self.ip_address,
        'Browser': generated_technical_data['app_code_name'],
        **generated_technical_data,
    }
    self.properties = self.base_properties
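A usage sketch for the profile object above; FakeDevice is a placeholder for the class name, which the snippet does not show, and DEVICE_OS_CHOICES / generate_random_ip are assumed to exist in the surrounding module.

device = FakeDevice()                     # hypothetical class name
print(device.uuid, device.ip_address)
print(device.properties['user_agent'])    # merged in from generate_navigator()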
Example #9
    def get_new_user_agent(self):
        """
        Gets a new user agent string from the user_agent module, making sure that if one has already been selected, it's
        not reused.

        :returns: A brand new user agent string.
        :rtype: str
        """
        new_user_agent = user_agent.generate_navigator()["user_agent"]
        if new_user_agent == self.user_agent:
            return self.get_new_user_agent()    # return the retry's result instead of the duplicate

        return new_user_agent
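With the return statement fixed above the method is correct, but every collision adds a stack frame; an equivalent loop-based sketch avoids unbounded recursion.

def get_new_user_agent(self):
    """Return a user agent string different from the one currently in use."""
    while True:
        new_user_agent = user_agent.generate_navigator()["user_agent"]
        if new_user_agent != self.user_agent:
            return new_user_agent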
Example #10
def test_build_id_firefox():
    orig_ff_ver = deepcopy(user_agent.base.FIREFOX_VERSION)
    user_agent.base.FIREFOX_VERSION = [
        ('49.0', datetime(2016, 9, 20)),
        ('50.0', datetime(2016, 11, 15)),
    ]
    try:
        for _ in range(50):
            nav = generate_navigator(navigator='firefox')
            assert len(nav['build_id']) == 14
            if '50.0' in nav['user_agent']:
                assert nav['build_id'].startswith('20161115')
            else:
                time_ = datetime.strptime(nav['build_id'], '%Y%m%d%H%M%S')
                assert datetime(2016, 9, 20, 0) <= time_
                assert time_ < datetime(2016, 11, 15)
    finally:
        user_agent.base.FIREFOX_VERSION = orig_ff_ver
Example #11
def test_build_id_firefox():
    from user_agent import base

    orig_ff_ver = deepcopy(base.FIREFOX_VERSION)
    base.FIREFOX_VERSION = [
        ('49.0', datetime(2016, 9, 20)),
        ('50.0', datetime(2016, 11, 15)),
    ]
    try:
        for _ in range(50):
            nav = generate_navigator(navigator='firefox')
            assert len(nav['build_id']) == 14
            if '50.0' in nav['user_agent']:
                assert nav['build_id'].startswith('20161115')
            else:
                time_ = datetime.strptime(nav['build_id'], '%Y%m%d%H%M%S')
                assert datetime(2016, 9, 20, 0) <= time_
                assert time_ < datetime(2016, 11, 15)
    finally:
        base.FIREFOX_VERSION = orig_ff_ver
Example #12
def thread_worker(thread_id, timeout, SPOOF_UA):

    while True:

        try:
            ip = a.ips.pop(0)
        except IndexError:
            print(f"[Thread #{thread_id}] [EOL]")   
            break 

        count = 0
        this_ip_port_data = {"ip": ip, "open_ports": []}

        for port in a.port_types:
            portnum = str(port["port"])
            #print(f"[Thread #{thread_id}] Trying {ip}:{portnum}....")

            try:
                if SPOOF_UA: headers = {"User-Agent": generate_navigator()["user_agent"]}
                else: headers = {"User-Agent": "Mozilla/5.0 (compatible; http-service-discovery/1.0; +https://mysite.io)"}

                r = get(f"http://{ip}:" + str(port["port"]), timeout=timeout, headers=headers)
                
                if (r.status_code not in a.status_codes_to_ignore
                        and len(r.headers) > 0
                        and not any(ele in r.text.lower() for ele in a.false_positives)):
                    this_ip_port_data["open_ports"].append({"port_number": port["port"], "port_info": port, "raw": r.text})
                    a.final_data.append(this_ip_port_data) 
                    a.hits += 1

                    while True:
                        # Threads wait their turn
                        if a.filelocked == False:
                            a.filelocked = True
                            json_object = json.dumps(a.final_data, indent = 4)
                            with open(a.s_hash + ".json", "w+") as t:
                                t.write(json_object)
                            a.filelocked = False    
                            break    

            except Exception:
                continue    # unreachable host or non-HTTP service; try the next port
Example #13
def get_url_content(url):
    back = requests.get(url, headers=user_agent.generate_navigator(os='win'))
    return back.content
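Note that generate_navigator() returns a whole navigator profile rather than a ready-made header dict; other examples on this page send only the user_agent field, which maps more cleanly onto HTTP headers. A sketch of that variant:

import requests
from user_agent import generate_navigator

def get_url_content_ua_only(url):
    # send just the User-Agent string instead of the whole navigator dict
    headers = {'User-Agent': generate_navigator(os='win')['user_agent']}
    return requests.get(url, headers=headers).content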
Example #14
def test_build_id_nofirefox():
    for _ in range(50):
        nav = generate_navigator(navigator='chrome')
        assert nav['build_id'] is None
        nav = generate_navigator(navigator='ie')
        assert nav['build_id'] is None
Example #15
def test_data_integrity():
    for _ in range(50):
        nav = generate_navigator()
        for _, val in nav.items():
            assert val is None or isinstance(val, six.string_types)
Example #16
def test_feature_platform():
    for _ in range(50):
        nav = generate_navigator(os='win')
        assert 'Win' in nav['platform']
        nav = generate_navigator(os='linux')
        assert 'Linux' in nav['platform']
Example #18
def __init__(self):
    """Create a requests Session with generated navigator headers."""
    self.session = Session()
    self.session.headers = generate_navigator()
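A usage sketch; ApiClient is a placeholder for the class name, which the snippet does not show.

client = ApiClient()
resp = client.session.get('https://example.com/')    # navigator headers are set once on the session
print(resp.status_code)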
Example #20
def test_platform_value(self):
    for x in range(100):
        nav = generate_navigator(platform='win')
        self.assertTrue('Win' in nav['platform'])
        nav = generate_navigator(platform='linux')
        self.assertTrue('Linux' in nav['platform'])
Example #21
def test_data_integrity(self):
    for x in range(100):
        nav = generate_navigator()
        for key, val in nav.items():
            self.assertTrue(isinstance(val, six.string_types))
Example #22
# coding=utf-8
"""
douban首页headers
"""
import user_agent
ua = user_agent.generate_navigator(os=None, navigator=None, platform=None, device_type=None)

douban_home_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ua["user_agent"]
}

'''douban ajax movie-list fetch headers'''
douban_referer_tag_headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com/tag/',
    'User-Agent': ua["user_agent"]
}

'''douban movies search headers'''
douban_ajax_search_headers = {
    'Accept': '*/*',
Example #24
    def _getLinksList( self, FilePath='SkillPageLinks.txt', StoreFilePath='SkillPageLinks.txt' ):
        '''
        Purpose: 	Get the skill page links either by crawling the LinkedIn directory index pages
                    or by loading from a file.
        Arguments:
            FilePath - str - [ optional ] path to the file where the skill page links are kept
                                By default, an attempt will be made to load the links from the default file path.
                                If None, the LinkedIn website will be scraped for the links.
            StoreFilePath - str - [ optional ] Path to the file where the scraped skill page links will be stored.
                                    If None, any scraping results will not be stored in a file.
        Returns:
            SkillPageLinks - list of str - list of skill page links
        '''

        # LOAD FROM FILE IF POSSIBLE
        if FilePath is not None:
            with open( FilePath, 'r' ) as SkillPageLinksFile:
                Reader = csv.reader( SkillPageLinksFile, delimiter=',' )
                SkillPageLinks = [ Row[ 0 ] for Row in Reader if len( Row ) > 0 ]
                return SkillPageLinks

        if not self.SessionLoggedIn: self._loginSession()

        SkillsBasePage = self.Session.get(
            SITE[ 'Skills' ][ 'Base' ]
            ,headers=generate_navigator()
        )
        sleep(uniform(self.MinSleepSecs,self.MaxSleepSecs))
        SkillBaseXML = html.fromstring( SkillsBasePage.text )

        # COLLECT ALL TOPIC NAMES
        TopicNameElms = SkillBaseXML.xpath( '//ol[@class="bucket-list"]/li/a' )
        TopicNames = [ ( TopicNameElm.text.lower(), TopicNameElm.attrib[ 'href' ] ) \
                        for TopicNameElm in TopicNameElms ]
        ProgressBar = tqdm(total=len( TopicNames ), desc='Finding Skill Page Links', unit='Topic')
        SkillPageLinks = []

        for TopicName, TopicLink in TopicNames:

            # SPECIAL CASE FOR #
            if TopicName == '#': TopicName = 'more'

            # GET COUNT OF NUMBER OF SKILL PARENT PAGES FOR THIS TOPIC
            TopicPage = self.Session.get(
                TopicLink
                ,headers=generate_navigator()
            )
            TopicPageXML = html.fromstring( TopicPage.text )
            TopicPageElms = TopicPageXML.xpath( '//div[@class="section last"]/div/ul/li/a' )

            # GET THE URLS FOR THE PARENT PAGES OF SKILLS PAGES
            SkillParentPageLinks = []
            for iTopic in range( 1, len( TopicPageElms ) + 1 ):
                if 'topics-' in TopicPageElms[ iTopic - 1 ].attrib[ 'href' ]:       # if this is still a parent index page
                    SkillParentPageLink = SITE[ 'Skills' ][ 'Base' ][ :-1 ] + \
                                           SITE[ 'Skills' ][ 'TopicLinks' ]
                    SkillParentPageLink = SkillParentPageLink.format( TopicName=TopicName, TopicIndex=iTopic )
                else:																					# if this is a base skill page
                    SkillParentPageLink = TopicPageElms[ iTopic - 1 ].attrib[ 'href' ]
                SkillParentPageLinks.append(SkillParentPageLink)

            # VISIT EACH SKILL PARENT INDEX PAGE AND GET ALL OF THE URLS
            for SkillParentPageLink in SkillParentPageLinks:
                if 'topics-' in SkillParentPageLink:										# when topics are three layers deep
                    SkillParentPage = self.Session.get(
                        SkillParentPageLink
                        ,headers=generate_navigator()
                    )
                    sleep(uniform(self.MinSleepSecs, self.MaxSleepSecs))
                    SkillParentPageXML = html.fromstring( SkillParentPage.text )
                    SkillPageElms = SkillParentPageXML.xpath( '//div[@class="section last"]/div/ul/li/a' )
                    SkillPageLinks += [ SkillPageElm.attrib[ 'href' ] for SkillPageElm in SkillPageElms ]
                else:																		# some topics are only two layers deep
                    SkillPageLinks.append( SkillParentPageLink )

            ProgressBar.update( 1 )

        # SAVE ALL SKILL PAGE LINKS TO FILE AS CSV
        if StoreFilePath is not None:
            with open( StoreFilePath, 'w') as SkillPageLinksFile:
                Writer = csv.writer( SkillPageLinksFile )
                SkillPageLinksArray = [ [ x ] for x in SkillPageLinks if x not in ( '', None ) ]
                Writer.writerows( SkillPageLinksArray )

        return SkillPageLinks
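A hedged end-to-end sketch tying the scraper methods on this page together; the class name, constructor signature, and credentials are placeholders.

scraper = LinkedInScraper(Username='me@example.com', Password='example-password')
if scraper._loginSession():
    links = scraper._getLinksList(FilePath=None, StoreFilePath='SkillPageLinks.txt')
    for link in links[:10]:
        data = scraper.scrapeSkill(link)
        print(data.get('Skill'), len(data['Companies']), len(data['RelatedSkills']))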