Example #1
    def _get_language_data_path(
            self,
            file_service: FileService,
            run_type: RunType):
        output_data_path = file_service.get_data_path()
        language_data_path = os.path.join(
            output_data_path, f'{run_type.to_str()}_language_data.pickle')

        if not os.path.exists(language_data_path):
            challenge_path = file_service.get_challenge_path()
            full_data_path = os.path.join(challenge_path, 'full')
            if not os.path.exists(full_data_path) or len(os.listdir(full_data_path)) == 0:
                newseye_path = os.path.join('data', 'newseye')
                trove_path = os.path.join('data', 'trove')
                # ocr_download.combine_data(challenge_path, newseye_path, trove_path)
                # TODO Fix download

            pickles_path = file_service.get_pickles_path()
            preprocess_data(
                self._tokenize_service,
                self._metrics_service,
                self._vocabulary_service,
                pickles_path,
                full_data_path,
                output_data_path)

        return language_data_path
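The helper above only guarantees that the pickle file exists; callers then deserialize it themselves, exactly as the dataset constructor in Example #5 does. A minimal consuming sketch (the method name _load_language_data is illustrative and assumes the same os/pickle imports as the surrounding examples):

    def _load_language_data(self, file_service: FileService, run_type: RunType):
        # Resolve (and, if missing, create) the preprocessed pickle, then load it.
        language_data_path = self._get_language_data_path(file_service, run_type)
        with open(language_data_path, 'rb') as data_file:
            return pickle.load(data_file)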
Example #2
def process_file():
    if not request.json or not request.json.get('fileId'):
        requestData = request.json or str(request.form) or request.data
        return make_response('Invalid content: ' + str(requestData), 400)

    db = MongoInit().initialize()

    payload = { 'fileId': request.json['fileId'], 'user': get_current_user().id }

    fileService = FileService(db)
    messageService = MessageService(db)
    
    chunks = fileService.getChunksByFileId(payload['fileId'])

    messages = []
    if all(c.user == payload['user'] for c in chunks):
        messages = messageService.parseFileChunks(chunks)

    topMessages = []

    for message in messages[:50]:
        topMessages.append({'subject': message.subject, 'sender': message.sender, 'content': message.content, 'date': message.date})

    result = {'fileId': payload['fileId'], 'messages': topMessages}

    return make_response(jsonify(result))
Example #3
def create_file():
    if not request.json or not request.json.get('data'):
        requestData = request.json or str(request.form) or request.data
        return make_response('Invalid content: ' + str(requestData), 400)

    db = MongoInit().initialize()
    
    payload = { 'fileId': request.json['fileId'], 'data': request.json['data'], 'position': request.json['position'], 'user': get_current_user().id }

    fileService = FileService(db)
    
    file = fileService.processFileChunk(None, payload['fileId'], payload['user'], payload['data'], payload['position'])
    
    return make_response(jsonify({'fileId': file._id, 'position': file.position}), 201)
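Examples #2 and #3 are two halves of the same flow: create_file stores individual chunks (fileId, data, position) for the logged-in user, and process_file later parses all chunks of that file into messages. A hedged client-side sketch using the requests library; the base URL, route paths, and session-based login are assumptions, since the route decorators are not shown:

import requests

BASE_URL = 'http://localhost:5000'   # assumed host; the paths below are hypothetical
session = requests.Session()         # assumes a session cookie backs get_current_user()

# Upload one chunk of a file (handled by create_file, Example #3).
session.post(f'{BASE_URL}/files', json={
    'fileId': 'abc123',
    'data': '<chunk payload>',
    'position': 0,
})

# Ask the server to parse the uploaded chunks into messages (process_file, Example #2).
response = session.post(f'{BASE_URL}/files/process', json={'fileId': 'abc123'})
print(response.json()['messages'])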
Example #4
    def _get_language_data_path(self, file_service: FileService,
                                run_type: RunType):
        output_data_path = file_service.get_data_path()
        language_data_path = os.path.join(
            output_data_path, f'{run_type.to_str()}_language_data.pickle')

        if not os.path.exists(language_data_path):
            train_data_path = file_service.get_pickles_path()
            test_data_path = None
            preprocess_data(train_data_path, test_data_path, output_data_path,
                            self._tokenize_service.tokenizer,
                            self._vocabulary_service)

        return language_data_path
Example #5
    def __init__(
            self,
            language: str,
            arguments_service: PostOCRArgumentsService,
            file_service: FileService,
            tokenize_service: BaseTokenizeService,
            run_type: RunType,
            **kwargs):
        super(NewsEyeDataset, self).__init__()

        self._arguments_service = arguments_service

        output_data_path = file_service.get_data_path()
        language_data_path = os.path.join(
            output_data_path, f'{run_type.to_str()}_language_data.pickle')

        if not tokenize_service.is_tokenizer_loaded():
            full_data_path = os.path.join(
                'data', 'ICDAR2019_POCR_competition_dataset', 'ICDAR2019_POCR_competition_full_22M_without_Finnish')

            vocabulary_size = tokenize_service.vocabulary_size
            train_spm_model(full_data_path, output_data_path, language, vocabulary_size)
            tokenize_service.load_tokenizer_model()

        if not os.path.exists(language_data_path):
            train_data_path = os.path.join(
                'data', 'ICDAR2019_POCR_competition_dataset', 'ICDAR2019_POCR_competition_training_18M_without_Finnish')
            test_data_path = os.path.join(
                'data', 'ICDAR2019_POCR_competition_dataset', 'ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')

            preprocess_data(language, train_data_path, test_data_path,
                            output_data_path, tokenize_service.tokenizer)

        with open(language_data_path, 'rb') as data_file:
            self._language_data: LanguageData = pickle.load(data_file)
Example #6
    def __init__(self, language: Language,
                 arguments_service: PretrainedArgumentsService,
                 tokenize_service: BaseTokenizeService,
                 file_service: FileService,
                 vocabulary_service: VocabularyService, **kwargs):
        super(SemEvalTestDataset, self).__init__()

        self._arguments_service = arguments_service

        challenge_path = file_service.get_challenge_path()
        targets_path = os.path.join(challenge_path, 'eval', str(language),
                                    'targets.txt')

        with open(targets_path, 'r', encoding='utf-8') as targets_file:
            self._target_words = targets_file.read().splitlines()
            self._target_words.sort(key=lambda v: v.upper())

        # English words end with POS tags (e.g. 'test_nn')
        if language == Language.English:
            target_words = [x[:-3] for x in self._target_words]
        else:
            target_words = self._target_words

        if arguments_service.include_pretrained_model:
            encodings = tokenize_service.encode_sequences(target_words)
            self._target_word_ids = [x[0] for x in encodings]
        else:
            self._target_word_ids = [
                vocabulary_service.string_to_id(target_word)
                for target_word in target_words
            ]
Example #7
 def post(self):
     files = request.files.getlist('files')
     file_service = FileService()
     # result = file_service.uploan_photo(files)
     # In production, use the method below and change the URL prefix to
     # https://s3-ap-northeast-1.amazonaws.com/tokenpark-test/
     result = file_service.uploan_photo_s3_session(files)
     flag = result["flag"]
     file_path_list = result["file_path_list"]
     if flag == 0:
         pass
     elif flag == 1:
         self.return_error(20001)
     elif flag == 2:
         self.return_error(20002)
     elif flag == 3:
         self.return_error(20003)
     return file_path_list
Example #8
    def _fetch_size(self):
        """
        Output the file size in formatted output.

        :return: file size in formatted output.
        """
        length = self._file_info.get(LENGTH)
        if not length:
            return None

        return FileService.calculate_formatted_size(length)
Example #9
    def _fetch_size(self):
        """
        Output the size of the torrent size.

        :return: the formatted size of torrent
        :rtype string
        """
        info = self._metainfo.get(INFO)
        if not info:
            return None

        piece_length = info.get(PIECE_LENGTH)
        if not piece_length:
            return None

        return FileService.calculate_formatted_size(piece_length)
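Examples #8 and #9 both delegate the formatting to FileService.calculate_formatted_size, whose implementation is not shown here. A minimal sketch of what such a byte-count formatter typically looks like; this is an assumption about the helper, not its actual code:

def calculate_formatted_size(length: int) -> str:
    # Hypothetical stand-in: convert a raw byte count into a human-readable string.
    size = float(length)
    for unit in ('B', 'KiB', 'MiB', 'GiB'):
        if size < 1024:
            return f'{size:.1f} {unit}'
        size /= 1024
    return f'{size:.1f} TiB'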
Example #10
    def __init__(
            self, file_service: FileService, device: str,
            pretrained_representations_options: PretrainedRepresentationsOptions
    ):
        super().__init__()

        self._device = device
        self.do_not_save: bool = (
            not pretrained_representations_options.fine_tune_pretrained and
            not pretrained_representations_options.fine_tune_after_convergence)

        self._include_pretrained = pretrained_representations_options.include_pretrained_model
        self._pretrained_model_size = pretrained_representations_options.pretrained_model_size
        self._pretrained_weights = pretrained_representations_options.pretrained_weights
        self._pretrained_max_length = pretrained_representations_options.pretrained_max_length
        self._pretrained_model: PreTrainedModel = None

        self._fine_tune_pretrained = pretrained_representations_options.fine_tune_pretrained
        self._fine_tune_after_convergence = pretrained_representations_options.fine_tune_after_convergence

        self._include_fasttext_model = pretrained_representations_options.include_fasttext_model

        if self._include_pretrained and self._pretrained_model_size and self._pretrained_weights:
            if pretrained_representations_options.pretrained_model == PretrainedModel.BERT:
                self._pretrained_model = BertModel.from_pretrained(
                    pretrained_representations_options.pretrained_weights)
            elif pretrained_representations_options.pretrained_model == PretrainedModel.CamemBERT:
                self._pretrained_model = CamembertModel.from_pretrained(
                    pretrained_representations_options.pretrained_weights)

            if pretrained_representations_options.fine_tune_pretrained:
                self._pretrained_model.train()
            else:
                self._pretrained_model.eval()

        if self._include_fasttext_model:
            assert pretrained_representations_options.fasttext_model is not None, 'a fastText model must be supplied when include-fasttext-model is set to true'

            data_path = file_service.get_initial_data_path()
            fasttext_path = os.path.join(
                data_path, 'fasttext',
                pretrained_representations_options.fasttext_model)
            assert os.path.exists(
                fasttext_path), f'fast text model not found in {fasttext_path}'

            self._fasttext_dimension = pretrained_representations_options.fasttext_model_size
            self._fasttext_model = fasttext.load_model(fasttext_path)
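Once loaded, the fastText model from Example #10 is typically queried for word embeddings during the forward pass. A short usage sketch with the public fasttext API; the model filename is only a placeholder:

import fasttext

# Load a binary fastText model and look up one word vector (the path is a placeholder).
model = fasttext.load_model('data/fasttext/cc.en.300.bin')
vector = model.get_word_vector('newspaper')   # numpy array of length model.get_dimension()
print(vector.shape)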
Example #11
 def map_action(self) -> (str, bool):
     """
     Map self.action to the corresponding FileService method and invoke it.

     Uses:
         - self.file and self.args: forwarded to the FileService constructor
         - self.action: str, the name of the FileService method to call
     Returns:
         - the response of the invoked method, expected to be (str, bool)
     """
     file_service = FileService(self.file, **self.args)
     try:
         handler = getattr(file_service, self.action)
     except AttributeError:
         # Only a missing method means the action itself is invalid; errors raised
         # inside the handler should propagate unchanged.
         raise ValueError(
             "Unexpected action: '{}' does not map to controller"
             .format(self.action))
     return handler()
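The method relies on getattr to turn an action name into a FileService method call. A self-contained sketch of that dispatch pattern; DemoService and its actions are illustrative only, not part of the original project:

class DemoService:
    # Illustrative stand-in for FileService: each public method is one "action".
    def read(self):
        return 'read called', True

    def write(self):
        return 'write called', True


def dispatch(service, action: str):
    # Look the action up by name and fail loudly when it does not exist.
    handler = getattr(service, action, None)
    if handler is None:
        raise ValueError(f"Unexpected action: '{action}' does not map to controller")
    return handler()


print(dispatch(DemoService(), 'read'))   # ('read called', True)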
Example #12
    def __init__(self, arguments_service: PretrainedArgumentsService,
                 dataloader_service: DataLoaderService,
                 loss_function: LossBase, optimizer: OptimizerBase,
                 log_service: LogService, file_service: FileService,
                 model: ModelBase):

        self._arguments_service = arguments_service
        self._model_path = file_service.get_checkpoints_path()
        self._optimizer_base = optimizer

        self._log_service = log_service
        self._dataloader_service = dataloader_service

        self._loss_function = loss_function
        self._model = model.to(arguments_service.device)
        self.data_loader_train: DataLoader = None
        self.data_loader_validation: DataLoader = None

        self._initial_patience = self._arguments_service.patience
        # if we are going to fine-tune after initial convergence
        # then we set a low patience first and use the real one in
        # the second training iteration set
        if self._arguments_service.fine_tune_after_convergence:
            self._initial_patience = 5
Example #13
 def save_results(self, path=Defaults.output_path):
     output = 'image_id,category\n'
     for image in sorted(self._classified_images, key=lambda img: img.id):
         output += '{},{}\n'.format(image.id, image.classification)
     FileService().write_csv_file(path, output)
Example #14
 def load_images(self, path):
     image_data_array = FileService().read_h5_file(path)
     for image_id, image_data in enumerate(image_data_array):
         self._image_queue.enqueue(Image(image_id, image_data))
     self._image_queue.shuffle()
     self._get_next_image()
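read_h5_file itself is not shown. If the HDF5 file simply stores an array of images, a reader along these lines would suffice; this is a sketch with h5py, and the dataset name 'images' is a guess rather than something taken from the original code:

import h5py
import numpy as np

def read_h5_file(path):
    # Minimal sketch: open the HDF5 file and return its image array as a numpy array.
    # The dataset key 'images' is an assumption.
    with h5py.File(path, 'r') as h5_file:
        return np.array(h5_file['images'])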
Example #15
from services.scrape_service import scrapeContent
from services.file_service import FileService
from services.email_service import sendMail
from services.argument_parser import ArgumentParser
from helper.url_builder import buildUrl

import numpy as np
import os

appPath = os.path.dirname(os.path.abspath(__file__))
fileService = FileService(appPath)

arguments = ArgumentParser()
url = buildUrl(arguments)

currentBikeList = scrapeContent(url)
storedBikeList = fileService.readFromFile(arguments.fileName)

newBikesExists = not np.array_equal(currentBikeList, storedBikeList)
if newBikesExists:
    newBikes = [item for item in currentBikeList if item not in storedBikeList]

    if len(newBikes):
        sendMail(arguments.email, newBikes)

    fileService.writeToFile(arguments.fileName, currentBikeList)


Example #16
from services.general_service import GeneralService
from services.file_service import FileService
from services.os_service import OSService
from services.music_service import MusicService
from services.simon_says_service import SimonSaysService
from services.quiz_service import QuizService
# The two imports below are missing in the original snippet; the module paths
# assume the same services.* naming convention as the imports above.
from services.internet_service import InternetService
from services.api_call_service import ApiCallService
from assistant_logic.assistant import Assistant

# Program dependencies
assistant = Assistant()
os_service = OSService()
internet_service = InternetService()
general_service = GeneralService()
simon_says_service = SimonSaysService()
quiz_service = QuizService()
file_service = FileService("./appsettings.json")
appsettings = file_service.read_appsettings()
api_call_service = ApiCallService(appsettings["WeatherApiKey"],
                                  appsettings["WolframalphaApiKey"])
music_service = MusicService(appsettings["MusixmatchApiKey"])

# Startup
print("Loading your AI personal assistant Jarvis")
assistant.speak("Loading your AI personal assistant Jarvis")
assistant.wish_me()

print("\n     ###############     ")
print("     ##           ##     ")
print("     #  ~~     ~~  #     ")
print("     #  ()     ()  #     ")
print("     (      ^      )     ")
Example #17
    def __init__(self, arguments_service: NERArgumentsService,
                 vocabulary_service: VocabularyService,
                 file_service: FileService,
                 tokenize_service: BaseTokenizeService,
                 data_service: DataService, cache_service: CacheService,
                 string_process_service: StringProcessService):
        super().__init__()

        self._arguments_service = arguments_service
        self._tokenize_service = tokenize_service
        self._file_service = file_service
        self._data_service = data_service
        self._string_process_service = string_process_service

        self._entity_tag_types = arguments_service.entity_tag_types

        self._data_version = "1.3"
        self.PAD_TOKEN = '[PAD]'
        self.START_TOKEN = '[CLS]'
        self.STOP_TOKEN = '[SEP]'

        self.pad_idx = 0
        self.start_idx = 1
        self.stop_idx = 2

        data_path = file_service.get_data_path()
        language_suffix = self.get_language_suffix(arguments_service.language)

        train_cache_key = f'train-hipe-data-v{self._data_version}-limit-{arguments_service.train_dataset_limit_size}-{arguments_service.split_type.value}-merge-{arguments_service.merge_subwords}-replacen-{arguments_service.replace_all_numbers}'
        validation_cache_key = f'validation-hipe-data-v{self._data_version}-limit-{arguments_service.validation_dataset_limit_size}-{arguments_service.split_type.value}-merge-{arguments_service.merge_subwords}-replacen-{arguments_service.replace_all_numbers}'
        self._train_ne_collection = cache_service.get_item_from_cache(
            item_key=train_cache_key,
            callback_function=lambda: self.preprocess_data(
                os.path.join(
                    data_path,
                    f'HIPE-data-v{self._data_version}-train-{language_suffix}.tsv'),
                limit=arguments_service.train_dataset_limit_size))

        self._validation_ne_collection = cache_service.get_item_from_cache(
            item_key=validation_cache_key,
            callback_function=lambda: self.preprocess_data(
                os.path.join(
                    data_path,
                    f'HIPE-data-v{self._data_version}-dev-{language_suffix}.tsv'),
                limit=arguments_service.validation_dataset_limit_size))

        if arguments_service.evaluate:
            test_cache_key = f'test-hipe-data-v{self._data_version}-{arguments_service.split_type.value}-merge-{arguments_service.merge_subwords}-replacen-{arguments_service.replace_all_numbers}'
            self._test_ne_collection = cache_service.get_item_from_cache(
                item_key=test_cache_key,
                callback_function=lambda: (self.preprocess_data(
                    os.path.join(
                        data_path,
                        f'HIPE-data-v{self._data_version}-test-{language_suffix}.tsv'
                    ))))

        self._entity_mappings = self._create_entity_mappings(
            self._train_ne_collection, self._validation_ne_collection)

        vocabulary_cache_key = f'char-vocabulary-{self._data_version}'
        vocabulary_data = cache_service.get_item_from_cache(
            item_key=vocabulary_cache_key,
            callback_function=lambda: self._generate_vocabulary_data(
                language_suffix, self._data_version))

        vocabulary_service.initialize_vocabulary_data(vocabulary_data)
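Example #17 leans on cache_service.get_item_from_cache(item_key, callback_function), which returns a cached value and only runs the callback on a miss. The service itself is not shown; a minimal file-backed sketch of that pattern, where the class name and pickle-per-key layout are assumptions:

import os
import pickle


class SimpleCacheService:
    # Illustrative cache with the same call shape as cache_service in Example #17.
    def __init__(self, cache_dir: str = '.cache'):
        self._cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def get_item_from_cache(self, item_key: str, callback_function):
        cache_path = os.path.join(self._cache_dir, f'{item_key}.pickle')
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as cache_file:
                return pickle.load(cache_file)

        # Cache miss: compute the value once and persist it for the next run.
        item = callback_function()
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(item, cache_file)
        return item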