Exemplo n.º 1
0
    def browse(self, lineEdit, fileType):
        """Let the user pick a directory or file and show the path in ``lineEdit``.

        Args:
            lineEdit: Widget whose ``setText`` receives the normalised path.
            fileType: ``"dir"`` opens a directory picker; any other value is
                used as the pattern of an open-file dialog, wrapped as
                ``"Files (<fileType>)"``.

        Both dialogs start in the directory returned by ``get_root_path()``.
        Qt returns an empty string when the dialog is cancelled; in that case
        ``lineEdit`` is left unchanged instead of being overwritten with a
        bogus path (a lone backslash, or ``"."`` from normalising ``""``).
        """
        if fileType == "dir":
            fname = QFileDialog.getExistingDirectory(
                self, "Browse Directory", get_root_path(),
                QFileDialog.ShowDirsOnly)
            if fname:
                # Only a real selection gets the trailing separator.
                fname += "\\"
        else:
            # The second tuple element is the chosen name filter; it is not
            # needed here (and must not shadow the builtin ``filter``).
            fname, _selected_filter = QFileDialog.getOpenFileName(
                self, "Browse Files", get_root_path(),
                "Files (" + fileType + ")")

        if not fname:
            # Dialog cancelled: keep whatever the line edit already shows.
            return

        lineEdit.setText(os.path.normpath(fname))
Exemplo n.º 2
0
def main_excel(config_path, stopword_path, report_path, category, location,
               msgLabel, current_datetime):
    """Scrape job ads and write them to an Excel report, showing progress.

    Args:
        config_path: Passed to ``cr.Csv_Reader`` to load the scraping config.
        stopword_path: Not used by this function.
        report_path: Prefix for the report; concatenated directly with the
            generated file name in the progress text and handed to
            ``ew.Excel_Writer``.
        category: Job category; given to the scraping engine and, with
            spaces removed, used in the report file name.
        location: Job location; used the same way as ``category``.
        msgLabel: Object whose ``setText`` receives the accumulated progress
            text after every stage.
        current_datetime: Timestamp used in the file name, the progress text
            and the log call.

    Raises:
        Exception: Any error raised while scraping or writing is passed to
            ``log`` together with the formatted traceback and re-raised.
    """
    report_name = "".join([
        category.replace(' ', ''),
        "JobReport_",
        location.replace(' ', ''),
        "_",
        current_datetime.strftime("%Y%m%d_%H%M%S"),
        ".xlsx",
    ])
    shown_parts = []

    def announce(fragment):
        # Append to the running progress text and refresh the label.
        shown_parts.append(fragment)
        msgLabel.setText("".join(shown_parts))

    try:
        config_reader = cr.Csv_Reader(config_path)
        announce("".join([
            "Start scraping at ",
            str(current_datetime),
            "\nReport path at ",
            report_path,
            report_name,
            "\n",
        ]))

        engine = se.Scraping_Engine(
            config_reader.get_data(), category, location, excel=True)
        engine.scrape_all()
        announce("Scrape completed\nStart Excel report generation\n")

        excel_writer = ew.Excel_Writer(report_name, report_path)
        columns = (
            engine.site_list,
            engine.company_list,
            engine.title_list,
            engine.description_list,
            engine.skills_list,
            engine.location_list,
            engine.link_list,
        )
        excel_writer.write(*columns)
        announce("Excel report generation completed")
    except Exception:
        # Record the failure in the log folder, then let the caller see it.
        log_dir = get_root_path() + 'Log\\'
        log(log_dir, traceback.format_exc(), current_datetime)
        raise
Exemplo n.º 3
0
def main(config_path, stopword_path, report_path, category, location, msgLabel,
         current_datetime):
    """Scrape job ads, write a text report, then write a keyword report.

    Args:
        config_path: Passed to ``cr.Csv_Reader`` to load the scraping config.
        stopword_path: Passed to ``ke.KeywordExtractor``.
        report_path: Prefix for both output files; concatenated directly with
            the report file name in the progress text and handed to
            ``rw.Report_Writer``.
        category: Job category; used in titles and, with spaces removed, in
            the file names.
        location: Job location; used the same way as ``category``.
        msgLabel: Object whose ``setText`` receives the accumulated progress
            text after every stage.
        current_datetime: Timestamp used in file names, titles, the progress
            text and the log call.

    Raises:
        Exception: Any error raised during the run is passed to ``log``
            together with the formatted traceback and re-raised.
    """
    category_tag = category.replace(' ', '')
    location_tag = location.replace(' ', '')
    file_stamp = current_datetime.strftime("%Y%m%d_%H%M%S")
    title_stamp = current_datetime.strftime("%d/%m/%Y %H:%M:%S")

    report_name = "".join(
        [category_tag, "JobReport_", location_tag, "_", file_stamp, ".txt"])
    report_title = "".join(
        [category, " Job Report in ", location, " on ", title_stamp])
    keyword_name = "".join(
        [category_tag, "JobKeyword_", location_tag, "_", file_stamp, ".txt"])
    keyword_title = "".join(
        [category, " Job Keyword in ", location, " on ", title_stamp])
    separator = '=======================================================================================================================================================\n\n'
    shown_parts = []

    def announce(fragment):
        # Append to the running progress text and refresh the label.
        shown_parts.append(fragment)
        msgLabel.setText("".join(shown_parts))

    try:
        config_reader = cr.Csv_Reader(config_path)
        announce("".join([
            "Start scraping at ",
            str(current_datetime),
            "\nReport path at ",
            report_path,
            report_name,
            "\n",
        ]))

        engine = se.Scraping_Engine(
            config_reader.get_data(), category, location)
        engine.scrape_all()

        # Plain report: one entry per scraped ad, each closed by a separator.
        report_writer = rw.Report_Writer(report_name, report_path)
        report_writer.write_title(report_title)
        report_columns = (
            engine.site_list,
            engine.title_list,
            engine.company_list,
            engine.location_list,
            engine.description_list,
            engine.link_list,
        )
        report_writer.write_list(*report_columns,
                                 [separator] * len(engine.site_list))
        announce("Scrape completed\nStart keyword extraction\n")

        extractor = ke.KeywordExtractor(
            stopword_path,
            engine.description_list,
            engine.title_list,
            engine.company_list,
            engine.location_list,
            category + ' ' + location,
        )

        # Keyword report, part one: keywords in place of the descriptions.
        extractor.extract_each_text()
        keyword_writer = rw.Report_Writer(keyword_name, report_path)
        keyword_writer.write_title(keyword_title)
        keyword_columns = (
            engine.site_list,
            engine.title_list,
            engine.company_list,
            engine.location_list,
            extractor.keyword_list,
            engine.link_list,
        )
        keyword_writer.write_list(*keyword_columns,
                                  [separator] * len(engine.site_list))

        # Part two: keyword_list is read again after extract_all_text().
        extractor.extract_all_text()
        keyword_writer.write_text("Keywords of today's scraping: ")
        keyword_writer.write_list(extractor.keyword_list,
                                  [separator] * len(extractor.keyword_list))
        announce("Keyword extraction completed")
    except Exception:
        # Record the failure in the log folder, then let the caller see it.
        log_dir = get_root_path() + 'Log\\'
        log(log_dir, traceback.format_exc(), current_datetime)
        raise
Exemplo n.º 4
0
    def __init__(self):
        """Set up the scraping window: flags, UI file, default paths, buttons."""
        # Run the parent-class initialiser before touching any widget state.
        super(ScrapingGUI, self).__init__()

        # Add the system-menu and min/max button hints to the current flags.
        window_flags = self.windowFlags()
        window_flags = window_flags | QtCore.Qt.WindowSystemMenuHint
        window_flags = window_flags | QtCore.Qt.WindowMinMaxButtonsHint
        self.setWindowFlags(window_flags)

        # Populate this instance from the .ui file.
        ui_file = get_root_path() + 'src\\resources\\scraping.ui'
        loadUi(ui_file, self)

        # Pre-fill each path field with its default location under the root.
        default_paths = (
            (self.configLineEdit, 'Config\\job_ad_sites.csv'),
            (self.stopwordLineEdit, 'Config\\stopwords.txt'),
            (self.reportLineEdit, 'Report\\'),
        )
        for line_edit, relative_path in default_paths:
            line_edit.setText(get_root_path() + relative_path)

        # Wire the browse buttons first, then the two action buttons.
        click_handlers = (
            (self.configButton, self.browse_config),
            (self.stopwordButton, self.browse_stopword),
            (self.reportButton, self.browse_report),
            (self.startScrapeButton, self.start_scrape),
            (self.startWriteExcelButton, self.start_excel),
        )
        for button, handler in click_handlers:
            button.clicked.connect(handler)
class Test_Report_Writer(unittest.TestCase):
    """Checks what ``rw.Report_Writer`` writes to a scratch report file.

    Every test gets a fresh writer in ``setUp``; ``tearDown`` deletes the
    file afterwards.
    """

    TEST_FILENAME = 'Report_test.txt'
    TEST_PATH = get_root_path() + 'Report\\'
    TEST_TITLE = 'Test title'
    TEST_STRING1 = '1'
    TEST_STRING2 = '2'
    TEST_STRING3 = '3'
    TEST_STRING4 = '4'

    def setUp(self):
        """Create the writer under test, pointed at the scratch file."""
        self.test_report_writer = rw.Report_Writer(self.TEST_FILENAME,
                                                   self.TEST_PATH)

    def tearDown(self):
        """Delete the scratch file written by the test."""
        os.remove(self.TEST_PATH + self.TEST_FILENAME)

    def _assert_file_content(self, expected):
        """Assert that the scratch file contains exactly ``expected``."""
        target = self.TEST_PATH + self.TEST_FILENAME
        with open(target, 'r', encoding='utf-8') as handle:
            self.assertEqual(expected, handle.read())

    def test_write_title(self):
        """``write_title`` emits the title followed by an underline row."""
        self.test_report_writer.write_title(self.TEST_TITLE)

        underline = '\n====================================================\n'
        self._assert_file_content(self.TEST_TITLE + underline)

    def test_write_list(self):
        """``write_list`` numbers each entry and writes its items in order."""
        first_column = [self.TEST_STRING1, self.TEST_STRING3]
        second_column = [self.TEST_STRING2, self.TEST_STRING4]

        self.test_report_writer.write_list(first_column, second_column)

        expected = "".join([
            'Number: 1\n',
            self.TEST_STRING1,
            self.TEST_STRING2,
            'Number: 2\n',
            self.TEST_STRING3,
            self.TEST_STRING4,
        ])
        self._assert_file_content(expected)

    def test_write_text(self):
        """``write_text`` emits the text followed by a newline."""
        self.test_report_writer.write_text(self.TEST_TITLE)

        self._assert_file_content(self.TEST_TITLE + '\n')
    def setUp(self):
        """Write a two-word stop-word file and build the extractor fixtures."""
        self.TEST_FILENAME = get_root_path() + 'Config\\stopwords_test.txt'
        with open(self.TEST_FILENAME, 'w+', encoding='utf-8') as stopword_file:
            stopword_file.write('hello\nworld')

        # The stop words written above, as a set.
        self.expected_stop_word_set = frozenset(('hello', 'world'))

        self.test_list = [
            'hello world',
            'this is a testing class',
            'this',
            'ends',
        ]
        # Same placeholder text four times, reused for several arguments.
        self.omit_list = ['hello world' for _ in range(4)]

        # 4x4 sparse matrix with four stored entries.
        row_indices = np.array([0, 3, 1, 0])
        col_indices = np.array([0, 3, 1, 2])
        values = np.array([4, 5, 7, 9])
        self.test_matrix = coo_matrix((values, (row_indices, col_indices)),
                                      shape=(4, 4))

        placeholder = self.omit_list
        self.test_extractor = ke.KeywordExtractor(
            self.TEST_FILENAME,
            self.test_list,
            placeholder,
            placeholder,
            placeholder,
            placeholder[0],
        )
        # Result of sorting the sparse matrix, captured for later checks.
        self.actual_tuple = self.test_extractor._sort_coo(self.test_matrix)
Exemplo n.º 7
0
 def setUp(self):
     """Build a scraping engine from the job-ad sites CSV for each test."""
     # Load the site configuration shipped under the project root.
     site_config = pd.read_csv(get_root_path() + 'Config\\job_ad_sites.csv')
     self.test_scraping_engine = se.Scraping_Engine(
         site_config, self.TEST_CATEGORY, self.TEST_LOCATION)