    def test_extractJobInfo(self):
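        # batchExtractMatches scrapes numPages of Indeed search results for
        # searchTerm and returns the job keys captured by searchRegexEx.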
        keys = batchExtractMatches(numPages=1,
                                   searchTerm="software engineer",
                                   searchRegexEx=os.environ["JOB_KEYS_REGEX"],
                                   tor=False,
                                   port=9050,
                                   matchSingleWriteLocation=None,
                                   userAgent=None,
                                   sleepTime=None,
                                   htmlWriteLocations=None,
                                   matchWriteLocations=None,
                                   pageIncrements=10)

        key = keys[0]
        res = extractJobInfo(key,
                             replaceDict,
                             technologies={'software': ['software']},
                             tor=False,
                             port=9050)

        self.assertIn("posted", res.keys())
        self.assertIn("city", res.keys())
        self.assertIn("state", res.keys())
        self.assertIn("technologies", res.keys())
        self.assertIn("jobkey", res.keys())

    def test_getLocation(self):
        testLen = 15  # Must adjust numPages if this is set to 20 or higher
        keys = batchExtractMatches(numPages=1,
                                   searchTerm="software engineer",
                                   searchRegexEx=os.environ["JOB_KEYS_REGEX"],
                                   tor=False,
                                   port=9050,
                                   matchSingleWriteLocation=None,
                                   userAgent=None,
                                   sleepTime=None,
                                   htmlWriteLocations=None,
                                   matchWriteLocations=None,
                                   pageIncrements=10)
        locs = []
        for key in keys[:testLen]:
            soup = extractJobHTML(jobKey=key,
                                  tor=False,
                                  port=None,
                                  prettify=False)
            locs.append(getLocation(soup, replaceDict))

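        # getLocation is expected to return a (city, state) pair, or None when
        # no location can be parsed; None entries are skipped below.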
        for loc in locs:
            if loc is None:
                continue
            self.assertRegex(loc[0], "[a-z]+")
            self.assertRegex(loc[1], "[a-z]{2}")

        self.assertEqual(len(locs), testLen)

    def test_batchExtractMatches(self):
        keys = batchExtractMatches(numPages=1,
                                   searchTerm="software engineer",
                                   searchRegexEx=os.environ["JOB_KEYS_REGEX"],
                                   tor=False,
                                   port=9050,
                                   matchSingleWriteLocation=None,
                                   userAgent=None,
                                   sleepTime=None,
                                   htmlWriteLocations=None,
                                   matchWriteLocations=None,
                                   pageIncrements=10)
        self.assertIsInstance(keys, list)
        self.assertRegex(keys[0], "^[a-zA-Z0-9]+$")

    def test_extractJobHTML(self):
        keys = batchExtractMatches(numPages=1,
                                   searchTerm="data scientist",
                                   searchRegexEx=os.environ["JOB_KEYS_REGEX"],
                                   tor=False,
                                   port=9050,
                                   matchSingleWriteLocation=None,
                                   userAgent=None,
                                   sleepTime=None,
                                   htmlWriteLocations=None,
                                   matchWriteLocations=None,
                                   pageIncrements=10)
        key = keys[0]
        soup = extractJobHTML(jobKey=key, tor=False, port=None, prettify=False)
        self.assertIsInstance(soup, bs4.BeautifulSoup)

    def test_matchClass(self):
        keys = batchExtractMatches(numPages=1,
                                   searchTerm="software engineer",
                                   searchRegexEx=os.environ["JOB_KEYS_REGEX"],
                                   tor=False,
                                   port=9050,
                                   matchSingleWriteLocation=None,
                                   userAgent=None,
                                   sleepTime=None,
                                   htmlWriteLocations=None,
                                   matchWriteLocations=None,
                                   pageIncrements=10)
        key = keys[0]
        soup = extractJobHTML(jobKey=key, tor=False, port=None, prettify=False)
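        # match_class appears to build a predicate for find_all that matches
        # tags whose class attribute contains every listed class token.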
        description = str(
            soup.find_all(match_class(["jobsearch-JobComponent-description"])))
        self.assertIn("class=\"jobsearch-JobComponent-description",
                      description)

    def test_getTags(self):
        keys = batchExtractMatches(numPages=1,
                                   searchTerm="software engineer",
                                   searchRegexEx=os.environ["JOB_KEYS_REGEX"],
                                   tor=False,
                                   port=9050,
                                   matchSingleWriteLocation=None,
                                   userAgent=None,
                                   sleepTime=None,
                                   htmlWriteLocations=None,
                                   matchWriteLocations=None,
                                   pageIncrements=10)
        key = keys[0]
        soup = extractJobHTML(jobKey=key, tor=False, port=None, prettify=False)
        testTags = {
            "commonWords": ["a", "the", "an", "by", "for", "but"],
            "software": ["software"]
        }
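        # getTags is expected to return the categories from testTags whose
        # word lists match the job description (after replaceDict cleanup).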
        matchedTags = getTags(soup=soup,
                              tags=testTags,
                              replaceDict=replaceDict)
        self.assertIn("commonWords", matchedTags)
        self.assertIn("software", matchedTags)
def testTask(self, jobId):
    print("Starting job: ", jobId)

    scraperTaskDetails = ScraperTask.objects(pk=ObjectId(jobId))[0]
    print("DETAILS:", scraperTaskDetails)
    jobTitle, jobAliases, skills = (scraperTaskDetails.jobTitle,
                                    scraperTaskDetails.jobAliases,
                                    scraperTaskDetails.skills)
    active, username, taskName, cities = (scraperTaskDetails.active,
                                          scraperTaskDetails.username,
                                          scraperTaskDetails.taskName,
                                          scraperTaskDetails.selectedCities)
    # print("Task details: ", jobTitle, jobAliases, skills, active, username, taskName)


    # Format variables for scraping
    searchTerms = [jobTitle] + [x for x in jobAliases if x != '']
    # print("searchTerms: ", searchTerms)

    os.environ["JOB_KEYS_REGEX"] = "jobKeysWithInfo\[\'([A-Za-z0-9]+)\'\] \="

    jobKeys = []
    jobs = []
    TOR_ON = os.environ['TOR_ON'] == 'True'
    PAGES_PER_SCRAPE = int(os.environ['PAGES_PER_SCRAPE'])
    SLEEP_TIME_POST_TASK = int(os.environ['SLEEP_TIME_POST_TASK'])
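    # TOR_ON, PAGES_PER_SCRAPE, SLEEP_TIME, and SLEEP_TIME_POST_TASK all come
    # from the environment, so scraping behavior is configured per deployment.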
    completedEndPoint = "http://backend:8080/api/v1/scrapertask"

    if os.environ['SCRAPING_ON'] == 'True':
        try:
            # Extract job keys (Indeed's unique identifiers for job posts)
            for searchTerm in searchTerms:
                # print("WORKING ON SEARCH TERM: ", searchTerm)
                jobKeys += batchExtractMatches(
                    numPages=PAGES_PER_SCRAPE, searchTerm=searchTerm,
                    searchRegexEx=os.environ['JOB_KEYS_REGEX'], cities=cities,
                    tor=TOR_ON, port=9051, matchSingleWriteLocation=None,
                    userAgent=None, sleepTime=int(os.environ['SLEEP_TIME']),
                    htmlWriteLocations=None, matchWriteLocations=None,
                    pageIncrements=10)
                print("JOB KEYS ARE NOW: ", jobKeys)
            
            # Remove job keys for jobs that have already been scraped
            duplicateJobs = JobPost.objects(jobkey__in=jobKeys)
            duplicateJobKeys = [job.jobkey for job in duplicateJobs]
            print("Removing jobkeys that have already been scraped: {}".format(duplicateJobKeys))
            finalKeys = [key for key in jobKeys if key not in duplicateJobKeys]
            print("Final list of jobkeys to scrape: {}".format(finalKeys))


            # Extract info associated with each job key
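            # Assumes each entry in skills is a sequence whose first element is
            # the canonical skill name, e.g. ["python", "python3"] becomes
            # {"python": ["python", "python3"]}.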
            technologies = { x[0]:x for x in skills }
            jobs = batchExtractJobInfo(jobkeys=finalKeys, replaceDict=replaceDict, technologies=technologies, sleepTime=int(os.environ['SLEEP_TIME']), tor=TOR_ON, port=9051)
            print("Extracted info for {} jobs".format(len(jobs)))
            if jobs:
                print("First job extracted: ", jobs[0])
            
            # Insert to DB
            for job in jobs:
                nextJob = createJobPost(scraperTaskId=jobId, posted=job['posted'], city=job['city'], 
                                        state=job['state'], technologies=job['technologies'], jobkey=job['jobkey'], 
                                        taskName=taskName, username=username, title=job['title'], company=job['company'], 
                                        experience=job['experience'])
                nextJob.save()
        except Exception:
            # Notify backend; str() keeps the payload JSON-serializable, and
            # return prevents the success report below from also firing.
            print("Unexpected error:", sys.exc_info()[0])
            requests.post(completedEndPoint, json={"jobId": jobId, "status": "failed",
                                                   "error": str(sys.exc_info()[0])})
            return

    else:
        print("SCRAPING SWITCHED OFF")

    # Additional sleep time before reporting back
    time.sleep(SLEEP_TIME_POST_TASK)

    # Notify backend
    requests.post(completedEndPoint, json={"jobId": jobId, "numNewPosts": len(jobs),
                                           "status": "success", "error": "None"})
# EOF