Example #1
def route_compfileread(filepaths):
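    # Decompress each .gz archive into PCRatesFileFolder, then delete the archive.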
    for filepath in filepaths:
        try:
            # extracting data from .gz file.
            gzipfile = gzip.GzipFile(filepath, 'rb')
            gzipdata = gzipfile.read()
            gzipfile.close()

            # getting complete file name of the .gz file
            compfilename = utility.filename_from_filepath(filepath)
            # extracting the original file name
            filename = compfilename.split('.gz')[0]
            print(filename)

            # creating file and writing data
            uncompfile = open(
                config.ConfigManager().PCRatesFileFolder + '/' + filename,
                'wb')
            uncompfile.write(gzipdata)
            uncompfile.close()

        except BaseException as ex:
            utility.log_exception_file_and_filepath(
                ex,
                config.ConfigManager().PromptcloudLogFile, filepath)
        os.remove(filepath)
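
A minimal standalone sketch of the same decompress-and-write step, using context managers and shutil.copyfileobj; dest_dir is a hypothetical stand-in for the configured output folder (PCRatesFileFolder / PCFileFolder in these examples).

import gzip
import os
import shutil


def extract_gz(filepath, dest_dir):
    # original file name is the .gz name without its extension
    filename = os.path.basename(filepath).split('.gz')[0]
    with gzip.open(filepath, 'rb') as gzipfile, \
            open(os.path.join(dest_dir, filename), 'wb') as uncompfile:
        # stream the decompressed bytes straight into the target file
        shutil.copyfileobj(gzipfile, uncompfile)
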
Example #2
def route_compfileread(filepaths):
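    # Decompress each .gz archive into PCFileFolder; archives that cannot be
    # extracted are logged, and the source file is deleted either way.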
    for filepath in filepaths:
        try:
            # extracting data from .gz file.
            gzipfile = gzip.GzipFile(filepath, 'rb')
            gzipdata = gzipfile.read()
            gzipfile.close()

            # getting complete file name of the .gz file
            compfilename = utility.filename_from_filepath(filepath)
            # extracting the original file name
            filename = compfilename.split('.gz')[0]
            print(filename)

            # creating file and writing data
            uncompfile = open(
                config.ConfigManager().PCFileFolder + '/' + filename, 'wb')
            uncompfile.write(gzipdata)
            uncompfile.close()

        except BaseException as ex:
            utility.log_exception_with_filepath(ex, filepath)
            # writing to file the file names that cannot be extracted using
            # gzip
            # derive the name from filepath here, since compfilename is not
            # yet assigned when the gzip read itself fails
            utility.write_to_file(
                config.ConfigManager().PCDataAnalysisResultsFile, 'a',
                utility.filename_from_filepath(filepath) +
                '  cannot be extracted')
        os.remove(filepath)
Example #3
def fill_job_by_site(filepath):
    global totaljobsdict
    # site name is the string before _ in the file name
    site = (utility.filename_from_filepath(filepath)).split('_')[0]

    # condition to add dictionary key as site name and incrementing
    if site not in totaljobsdict:
        totaljobsdict[site] = 1
    else:
        totaljobsdict[site] += 1
    return totaljobsdict
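
For comparison, a sketch of the same per-site tally with collections.Counter; filepaths is an assumed list of the same file paths iterated one at a time above, and the site name is again taken as the part of the file name before the first '_'.

import os
from collections import Counter


def jobs_by_site(filepaths):
    # maps site name -> number of files seen for that site
    return Counter(os.path.basename(p).split('_')[0] for p in filepaths)
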
Example #4
def route_dataread(filepaths):
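    # Extract text from each file by type, insert the resulting document into
    # MongoDB, and keep the docid/execution counters up to date.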
    data_read_count = int(utility.read_from_file(
        config.ConfigManager().ExecutioncountFile, 'r'))
    file_read_count = 0
    file_path_count = 0
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection)
    docid_count = int(configdocs[0]['docid_count'])
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    utility.write_to_file(config.ConfigManager().LogFile,
                          'a', 'dataread running')
    for filepath in filepaths:
        data_text = ''
        try:
            file_path_count += 1
            print('File number: ' + str(file_path_count))
            print('Processing file..' + filepath)
            if filepath[-4:].lower() == ".txt":
                data_text = datareadfiletypes.read_text_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".pdf":
                data_text = datareadfiletypes.read_pdf_text(
                    filepath, data_text)
            elif filepath[-5:].lower() == ".docx":
                data_text = datareadfiletypes.read_docx_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".doc":
                data_text = datareadfiletypes.read_doc_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".xls":
                # data_text = datareadfiletypes.read_excel_text(
                    # filepath, data_text)
                docid_count = custom.process_excel_rowdata(
                    filepath, docid_count)
            elif filepath[-5:].lower() == ".xlsx":
                # data_text = datareadfiletypes.read_excel_text(
                    # filepath, data_text)
                docid_count = custom.process_excel_rowdata(
                    filepath, docid_count)
            elif filepath[-4:].lower() == ".csv":
                data_text = datareadfiletypes.read_csv_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".odt":
                data_text = datareadfiletypes.read_odt_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".xml":
                docid_count = custom.process_xml_data(filepath, docid_count)
            if not data_text == '':
                docid_count += 1
                file_read_count += 1
                # dcrnlp.extract_nounphrases_sentences(data_text)
                noun_phrases = ''
                dictionaries.DataProperties['description'] = data_text
                dictionaries.DataProperties['nounPhrases'] = noun_phrases
                dictionaries.DataProperties[
                    'documentType'] = utility.filefolder_from_filepath(filepath)
                dictionaries.DataProperties[
                    'dataSource'] = config.ConfigManager().Misc  # config.ConfigManager().JobPortal
                dictionaries.DataProperties['doc_id'] = docid_count
                dictionaries.DataProperties[
                    'documentTitle'] = utility.filename_from_filepath(filepath)
                dictionaries.DataProperties['documentDesc'] = (
                    dictionaries.DataProperties['description'])[0:200]
                jsonfordatastore = custom.prepare_json_for_datastore(
                    dictionaries.DataProperties)
                jsonfordatastore_deserialized = utility.jsonstring_deserialize(
                    jsonfordatastore)
                custom.insert_data_to_DB(
                    jsonfordatastore_deserialized, connection)
                phrases_file_data = custom.prepare_phrases_file_data(
                    noun_phrases, data_read_count, file_read_count)
                utility.write_to_file(
                    config.ConfigManager().PhraseFile, 'a', phrases_file_data)
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + filepath + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                config.ConfigManager().LogFile, 'a', exception_message)

    data_read_count += 1
    utility.write_to_file(config.ConfigManager().ExecutioncountFile,
                          'w', str(data_read_count))
    dictionaries.UpdateTemplateWhere['_id'] = configdocs[0]['_id']
    dictionaries.UpdateTemplateSet['docid_count'] = docid_count
    dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
    custom.update_data_to_Db_noupsert(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection,
        dictionaries.UpdateTemplateWhere, dictionaries.DBSet, connection)
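
The extension checks above can also be written as a dispatch table. A minimal sketch, assuming the project's datareadfiletypes module is importable and its helpers keep the (filepath, data_text) signature used above:

import os

import datareadfiletypes  # project module, assumed importable

READERS = {
    '.txt': datareadfiletypes.read_text_text,
    '.pdf': datareadfiletypes.read_pdf_text,
    '.docx': datareadfiletypes.read_docx_text,
    '.doc': datareadfiletypes.read_doc_text,
    '.csv': datareadfiletypes.read_csv_text,
    '.odt': datareadfiletypes.read_odt_text,
}


def read_by_extension(filepath, data_text=''):
    # os.path.splitext keys the lookup on the real extension instead of slicing
    reader = READERS.get(os.path.splitext(filepath)[1].lower())
    return reader(filepath, data_text) if reader else data_text
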
if __name__ == "__main__":
    filepaths = []
    directory_list = []
    # print(config.ConfigManager().fileDirectory)
    directory_list = utility.string_to_array(
        config.ConfigManager().fileDirectory, ',', directory_list)
    filepaths = filemanager.directory_iterate(directory_list)
    detection_dict_list = []
    detection_dict_list_all = []

    for filepath in filepaths:
        strtimestamp = str(datetime.datetime.now())
        data_text = ''
        # flags = []
        filepath_mod = filepath.replace('\\', '!@#$%')
        file_batchId_name = utility.filename_from_filepath(filepath_mod)
        batchIdsupplierId = (file_batchId_name[1:]).split('_')[0]
        batchId = (batchIdsupplierId).split('-')[0]
        supplierId = int((batchIdsupplierId).split('-')[1])
        file_name = (file_batchId_name[1:]).split('_')[1]
        file_name = file_name.replace('!@#$%', '\\')
        try:
            # file_count += 1
            # print(filepath)
            if filepath[-4:].lower() == ".txt":
                flagsInfo = component.read_text_text(filepath, supplierId)
                print(flagsInfo)
                flags = flagsInfo[0]
                flagsDetails = flagsInfo[1]
                flagsDetailsBoolean = bool(flagsDetails)
                if flagsDetailsBoolean:
Example #6
def route_dataread(filepaths):
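    # Extract resume text per file type and write it onto the matching
    # candidate record (resumeText, isResumeTextNew) in MongoDB.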
    file_count = 0
    docreadcount = 0
    antiwordemptycount = 0
    docid_count = 0  # referenced by the .xml branch below
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'resumeread running' + ' ' + str(datetime.datetime.now()))
    for filepath in filepaths:
        strtimestamp = str(datetime.datetime.now())
        data_text = ''
        try:
            file_count += 1

            if filepath[-4:].lower() == ".txt":
                data_text = datareadfiletypes.read_text_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".pdf":
                data_text = datareadfiletypes.read_pdf_text(
                    filepath, data_text)
            elif filepath[-5:].lower() == ".docx":
                data_text = datareadfiletypes.read_docx_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".doc":
                data_text = datareadfiletypes.read_doc_text(
                    filepath, data_text)
                if data_text == '':
                    antiwordemptycount += 1
                    data_text = datareadfiletypes.read_doc_text_catdoc(
                        filepath, data_text)
                if data_text == '':
                    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                          'Filepath is: ' + filepath)
            elif filepath[-4:].lower() == ".xls":
                data_text = datareadfiletypes.read_excel_text(
                    filepath, data_text)
            elif filepath[-5:].lower() == ".xlsx":
                data_text = datareadfiletypes.read_excel_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".csv":
                data_text = datareadfiletypes.read_csv_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".odt":
                data_text = datareadfiletypes.read_odt_text(
                    filepath, data_text)
            elif filepath[-4:].lower() == ".xml":
                docid_count = custom.process_xml_data(filepath, docid_count)
            strtimestamp += ' ' + str(datetime.datetime.now())
            if not data_text == '':
                filepath = filepath.replace('\\', '')
                file_name = utility.filename_from_filepath(filepath)
                candidateId = (file_name[1:]).split('_')[0]
                print(candidateId)
                UpdateTemplateWhere = utility.clean_dict()
                UpdateTemplateSet = utility.clean_dict()
                UpdateTemplateWhere['candidateid'] = int(candidateId)
                # UpdateTemplateWhere['documentType'] = 'candidate details'
                # UpdateTemplateWhere['dataSource'] = 'Smart Track'
                UpdateTemplateSet['resumeText'] = data_text
                UpdateTemplateSet['isResumeTextNew'] = 1
                DBSet = utility.clean_dict()
                DBSet['$set'] = UpdateTemplateSet
                custom.update_data_to_Db_noupsert(
                    int(config.ConfigManager().MongoDBPort),
                    config.ConfigManager().DataCollectionDB,
                    config.ConfigManager().STCandidateCollection,
                    UpdateTemplateWhere, DBSet, connection)
                strtimestamp += ' ' + str(datetime.datetime.now())
                if filepath[-4:].lower() == ".doc":
                    docreadcount += 1
                    print('Total doc read count:' + str(docreadcount))
            print('File : ' + str(file_count) + ' ' + strtimestamp)
            print('Antiword empty count:' + str(antiwordemptycount))
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + filepath + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            # .encode('utf8'))
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a', 'Number of resumes read - ' +
        str(file_count) + ' ' + str(datetime.datetime.now()))
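
The candidate update above goes through custom.update_data_to_Db_noupsert. A sketch of the equivalent call written directly against pymongo, with the database and collection names as assumed stand-ins for the ConfigManager values used above:

from pymongo import MongoClient


def update_resume_text(port, candidate_id, resume_text):
    client = MongoClient('localhost', port)
    # hypothetical names; the real ones come from config.ConfigManager()
    collection = client['DataCollectionDB']['STCandidateCollection']
    # $set touches only the listed fields; upsert=False skips unknown candidates
    collection.update_one(
        {'candidateid': candidate_id},
        {'$set': {'resumeText': resume_text, 'isResumeTextNew': 1}},
        upsert=False)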