Exemplo n.º 1
0
 def test_to_json(self):
     """Assert that ``to_json()`` of the config loaded from
     ``template_conf.yml`` equals the expected dictionary."""
     logger.info('Loading Configuration..')
     template_path = os.path.join(self.test_data_path, 'template_conf.yml')
     configuration = Configuration(config_src=template_path)
     # Expected output, assembled one section at a time
     aws_settings = {
         'access_key': 'access_key_1',
         'secret_key': 'secret_key_1',
         'instance_id': 'instance_id_1',
         'ec2_region': 'ec2_region_1',
         'ec2_amis': ['ec2_ami_1'],
         'ec2_keypair': 'ec2_keypair_1',
         'ec2_secgroups': ['ec2_secgroup_1'],
         'ec2_instancetype': 'ec2_instancetype_1',
     }
     mineserver_settings = {
         'ssh_key_file_path': 'ssh_key_file_path_1',
         'memory_allocation': 'memory_allocation_1',
     }
     web_client_settings = {'server_password': '******'}
     expected_json = {
         'aws': [{'config': aws_settings}],
         'mineserver': [{'config': mineserver_settings}],
         'web_client': [{'config': web_client_settings}],
     }
     # Compare
     logger.info('Comparing the results..')
     expected_sorted = self._sort_dict(expected_json)
     actual_sorted = self._sort_dict(configuration.to_json())
     self.assertDictEqual(expected_sorted, actual_sorted)
 def test_to_json(self):
     """Assert that ``to_json()`` of the config loaded from
     ``template_conf.yml`` equals the expected dictionary."""
     logger.info('Loading Configuration..')
     template_path = os.path.join(self.test_data_path, 'template_conf.yml')
     configuration = Configuration(config_src=template_path)
     # Expected output, assembled one entry at a time
     datastore_entry = {
         'config': {
             'hostname': 'host123',
             'username': '******',
             'password': '******',
             'db_name': 'db3',
             'port': 3306,
         },
         'type': 'mysql',
     }
     cloudstore_entry = {
         'config': {'api_key': 'apiqwerty'},
         'type': 'dropbox',
     }
     expected_json = {
         'tag': 'production',
         'datastore': [datastore_entry],
         'cloudstore': [cloudstore_entry],
     }
     # Compare
     logger.info('Comparing the results..')
     expected_sorted = self._sort_dict(expected_json)
     actual_sorted = self._sort_dict(configuration.to_json())
     self.assertDictEqual(expected_sorted, actual_sorted)
Exemplo n.º 3
0
    def __init__(self, language):
        """Initialize a WikiCorpus instance.

        :language: unicode
        """
        # TODO: check if language is in dictionary of iso codes
        self._language = language

        # The configuration is read from the class-level config path.
        config_path = WikiCorpus.CORPUS_CONFIG_PATH
        self._configuration = Configuration(config_path)
Exemplo n.º 4
0
def setup_classes(config_file: str, log: str, debug: bool):
    """Set up logging, load the configuration and return its first sections.

    :param config_file: passed to ``Configuration`` as ``config_src``
    :param log: passed to ``_setup_log`` as ``log_path``
    :param debug: passed to ``_setup_log`` as ``debug``
    :return: tuple ``(logger, config, aws_config, mineserver_config,
        web_client_config)``; ``web_client_config['permitted_days']`` is
        replaced by a list of the comma-separated, stripped entries
    """
    # Initialize
    _setup_log(log_path=log, debug=debug)

    # Load the configuration
    logger = logging.getLogger('Init')
    logger.debug("Loading the configs..")
    config = Configuration(config_src=config_file)
    aws_config, mineserver_config, web_client_config = (
        config.get_aws_configs()[0],
        config.get_mineserver_configs()[0],
        config.get_web_client_configs()[0],
    )

    # Turn the comma-separated string into a clean list of day names
    permitted_days = []
    for raw_day in web_client_config['permitted_days'].split(','):
        permitted_days.append(raw_day.strip())
    web_client_config['permitted_days'] = permitted_days

    return logger, config, aws_config, mineserver_config, web_client_config
Exemplo n.º 5
0
 def test_to_yaml(self):
     """Change the datastore hostname, export with ``to_yaml``, reload the
     exported file and compare its ``to_json()`` with the expected dict."""
     logger.info('Loading Configuration..')
     template_path = os.path.join(self.test_data_path, 'template_conf.yml')
     configuration = Configuration(config_src=template_path)
     # Modify and export yml
     logger.info('Changed the host and the api_key..')
     datastore_settings = configuration.datastore[0]['config']
     datastore_settings['hostname'] = 'changedhost'
     logger.info('Exporting to yaml..')
     export_target = 'test_data/test_configuration/actual_output_to_yaml.yml'
     configuration.to_yaml(export_target)
     # Load the modified yml
     logger.info('Loading the exported yaml..')
     exported_path = os.path.join(self.test_data_path,
                                  'actual_output_to_yaml.yml')
     modified_configuration = Configuration(config_src=exported_path)
     # Compare
     logger.info('Comparing the results..')
     expected_datastore = {
         'config': {
             'hostname': 'changedhost',
             'username': '******',
             'password': '******',
             'db_name': 'db3',
             'port': 3306,
         },
         'type': 'mysql',
     }
     expected_json = {
         'tag': 'production',
         'datastore': [expected_datastore],
     }
     expected_sorted = self._sort_dict(expected_json)
     actual_sorted = self._sort_dict(modified_configuration.to_json())
     self.assertDictEqual(expected_sorted, actual_sorted)
 def __init__(self, url):
     """Store *url* and start with empty document, raw text and data set."""
     self.url = url
     # Both text buffers start out as the empty string
     self.doc = self.rawtext = ""
     self.data = set()
     # Collaborators are created with their default constructors
     self.config = Configuration()
     self.helper = Helper()
 def __init__(self, file, keywords):
     """Store *file* and *keywords* and start with empty url/data lists."""
     # The configuration is created first, before any other attribute is set
     self.config = Configuration()
     self.keywords, self.file = keywords, file
     self.urls, self.data = [], []
     self.helper = Helper()
Exemplo n.º 8
0
 def __create_parser(configuration: Configuration):
     """Return the Wiktionary parser for the configured language.

     The language comes from ``configuration.get_parser()``. ``None`` is
     returned when it is neither ``Languages.ENGLISH`` nor
     ``Languages.POLISH``.
     """
     language = configuration.get_parser()
     if language == Languages.ENGLISH:
         return EnglishWiktionaryParser(configuration)
     if language == Languages.POLISH:
         return PolishWiktionaryParser(configuration)
     return None
Exemplo n.º 9
0
def init_main() -> Tuple[argparse.Namespace, Configuration]:
    """Parse the CLI arguments, set up logging and load the configuration.

    :return: the parsed arguments and the ``Configuration`` built from
        ``args.config_file``
    """
    cli_args = _argparser()
    _setup_log(cli_args.log, cli_args.debug)
    logger.info("Starting in run mode: {0}".format(cli_args.run_mode))
    # Load the configuration
    return cli_args, Configuration(config_src=cli_args.config_file)
 def setUpClass(cls):
     """Set up logging, require ``DROPBOX_API_KEY`` in the environment and
     load ``template_conf.yml`` into ``cls.configuration``.

     :raises Exception: when ``DROPBOX_API_KEY`` is not set
     """
     cls._setup_log()
     if os.environ.get("DROPBOX_API_KEY") is None:
         error_message = 'DROPBOX_API_KEY env variable is not set!'
         logger.error(error_message)
         raise Exception(error_message)
     logger.info('Loading Configuration..')
     template_path = os.path.join(cls.test_data_path, 'template_conf.yml')
     cls.configuration = Configuration(config_src=template_path)
Exemplo n.º 11
0
 def parse(self):
     """Build a ``Configuration`` from the values stored in ``self.args``."""
     cli = self.args
     # Map Configuration keyword names to their argument values
     settings = {
         'original_repo': cli['source_repo'],
         'new_repo_namespace': cli['dst_repo'],
         'working_directory': cli['cwd'],
         'sub_folder': cli['sub_folder'],
         'allowed_folders': cli['includes'],
         'not_allowed_folders': cli['excludes'],
         'regex_for_folder_name': cli['regexp'],
         'branch': cli['branch'],
     }
     return Configuration(**settings)
Exemplo n.º 12
0
 def setUpClass(cls):
     """Set up logging, require the Gmail environment variables and load
     ``template_conf.yml`` into ``cls.configuration``.

     :raises Exception: when ``EMAIL_ADDRESS`` or ``GMAIL_API_KEY`` is unset
     """
     cls._setup_log()
     required_vars = ['EMAIL_ADDRESS', 'GMAIL_API_KEY']
     missing_vars = [name for name in required_vars
                     if name not in os.environ]
     if missing_vars:
         error_message = 'Gmail env variables are not set!'
         logger.error(error_message)
         raise Exception(error_message)
     logger.info('Loading Configuration..')
     template_path = os.path.join(cls.test_data_path, 'template_conf.yml')
     cls.configuration = Configuration(config_src=template_path)
Exemplo n.º 13
0
    def __init__(self, language):
        """Initialize a WikiCorpus instance.

        :language: unicode
        """
        # TODO: check if language is in dictionary of iso codes
        self._language = language

        # The configuration is read from the class-level config path.
        config_path = WikiCorpus.CORPUS_CONFIG_PATH
        self._configuration = Configuration(config_path)
 def setUpClass(cls):
     """Set up logging, require ``DROPBOX_API_KEY``, load
     ``template_conf_all_args.yml`` and delete the remote tests folder.

     :raises Exception: when ``DROPBOX_API_KEY`` is not set
     """
     cls._setup_log()
     if os.environ.get("DROPBOX_API_KEY") is None:
         error_message = 'DROPBOX_API_KEY env variable is not set!'
         logger.error(error_message)
         raise Exception(error_message)
     logger.info('Loading Configuration..')
     conf_path = os.path.join(cls.test_data_path, 'template_conf_all_args.yml')
     cls.configuration = Configuration(config_src=conf_path)
     cls.remote_tests_folder = '/job_bot_tests'
     # Remove the remote tests folder using the first configured cloudstore
     first_cloudstore_config = cls.configuration.get_cloudstores()[0]
     cloud_store = JobBotDropboxCloudstore(config=first_cloudstore_config)
     cloud_store.delete_file(cls.remote_tests_folder)
    def test_schema_validation(self):
        """Check that the correct yml loads without ``ValidationError`` and
        that the wrong yml raises ``ValidationError``."""
        try:
            logger.info('Loading the correct Configuration..')
            correct_conf_path = os.path.join(self.test_data_path,
                                             'minimal_conf_correct.yml')
            schema_path = os.path.join('..', 'tests', self.test_data_path,
                                       'minimal_yml_schema.json')
            Configuration(config_src=correct_conf_path,
                          config_schema_path=schema_path)
        except ValidationError as e:
            logger.error('Error validating the correct yml: %s', e)
            self.fail('Error validating the correct yml')
        # self.fail() always raises, so this line is reached only on success
        logger.info('First yml validated successfully.')

        with self.assertRaises(ValidationError):
            logger.info('Loading the wrong Configuration..')
            wrong_conf_path = os.path.join(self.test_data_path,
                                           'minimal_conf_wrong.yml')
            Configuration(config_src=wrong_conf_path)
        logger.info('Second yml failed to validate successfully.')
    def test_init(self):
        """Check the flags of ``JobBotDropboxCloudstore`` after construction.

        With the class-level (all-args) configuration the store is expected
        to have attachments and every ``_update_*`` flag set. With the
        required-args-only configuration it is expected to have no
        attachments and every ``_update_*`` flag unset.
        """
        req_only_conf = Configuration(
            config_src=os.path.join(self.test_data_path, 'template_conf_required_args_only.yml'))

        cloud_store = JobBotDropboxCloudstore(config=self.configuration.get_cloudstores()[0],
                                              remote_files_folder=self.remote_tests_folder)
        boolean_attributes = [len(cloud_store.attachments_names) > 0,
                              cloud_store._update_stop_words,
                              cloud_store._update_application_to_send_email,
                              cloud_store._update_inform_success_email,
                              cloud_store._update_inform_should_call_email]
        # assertTrue(expr, msg): the condition must be the FIRST argument.
        # The previous assertTrue(True, all(...)) could never fail.
        self.assertTrue(all(boolean_attributes))

        req_only_cloud_store = JobBotDropboxCloudstore(config=req_only_conf.get_cloudstores()[0],
                                                       remote_files_folder=self.remote_tests_folder)
        req_only_boolean_attributes = [len(req_only_cloud_store.attachments_names) == 0,
                                       not req_only_cloud_store._update_stop_words,
                                       not req_only_cloud_store._update_application_to_send_email,
                                       not req_only_cloud_store._update_inform_success_email,
                                       not req_only_cloud_store._update_inform_should_call_email]
        self.assertTrue(all(req_only_boolean_attributes))
Exemplo n.º 17
0
 def create_config(data_loaded):
     """Build a ``Configuration`` from the ``'migrator'`` section of
     *data_loaded*."""
     section = data_loaded['migrator']
     # Map Configuration keyword names to the values of the section
     settings = {
         'original_repo': section['original_repo'],
         'new_repo_namespace': section['new_repo_namespace'],
         'working_directory': section['working_directory'],
         'sub_folder': section['sub_folder'],
         'allowed_folders': section['includes'],
         'not_allowed_folders': section['excludes'],
         'regex_for_folder_name': section['regex_folder_name'],
         'branch': section['branch'],
     }
     return Configuration(**settings)
Exemplo n.º 18
0
 def isConnected(self):
     """Probe the configured check URLs and report which one is reachable.

     The global check URL is tried first, then the China check URL, each
     with a 2 second timeout. Any response counts as reachable; the HTTP
     status code is not inspected.

     :return: ``"GLOBAL"``, ``"CHINA"`` or ``"NONETWORK"``
     """
     config = Configuration()
     # Only network-level failures mean "unreachable". A bare ``except``
     # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
     try:
         requests.get(config.global_check_url, timeout=2)
         return "GLOBAL"
     except requests.RequestException:
         pass
     try:
         requests.get(config.china_check_url, timeout=2)
         return "CHINA"
     except requests.RequestException:
         return "NONETWORK"
Exemplo n.º 19
0
 def setUpClass(cls):
     """Set up logging, require the MySQL environment variables and load
     ``template_conf.yml`` into ``cls.configuration``.

     :raises Exception: when any of the MySQL variables is not set
     """
     cls._setup_log()
     required_vars = [
         'MYSQL_HOST', 'MYSQL_USERNAME', 'MYSQL_PASSWORD', 'MYSQL_DB_NAME'
     ]
     missing_vars = [name for name in required_vars
                     if name not in os.environ]
     if missing_vars:
         error_message = 'Mysql env variables are not set!'
         logger.error(error_message)
         raise Exception(error_message)
     logger.info('Loading Configuration..')
     template_path = os.path.join(cls.test_data_path, 'template_conf.yml')
     cls.configuration = Configuration(config_src=template_path)
Exemplo n.º 20
0
 def test_to_yaml(self):
     """Change two settings, export with ``to_yaml``, reload the exported
     file and compare its ``to_json()`` with the expected dict."""
     logger.info('Loading Configuration..')
     template_path = os.path.join(self.test_data_path, 'template_conf.yml')
     configuration = Configuration(config_src=template_path)
     # Modify and export yml
     logger.info('Changed the host and the api_key..')
     aws_settings = configuration.aws[0]['config']
     aws_settings['access_key'] = 'access_key_2'
     mineserver_settings = configuration.mineserver[0]['config']
     mineserver_settings['ssh_key_file_path'] = 'ssh_key_file_path_2'
     logger.info('Exporting to yaml..')
     export_target = 'test_data/test_configuration/actual_output_to_yaml.yml'
     configuration.to_yaml(export_target)
     # Load the modified yml
     logger.info('Loading the exported yaml..')
     exported_path = os.path.join(self.test_data_path,
                                  'actual_output_to_yaml.yml')
     modified_configuration = Configuration(config_src=exported_path)
     # Compare
     logger.info('Comparing the results..')
     expected_aws = {
         'access_key': 'access_key_2',
         'secret_key': 'secret_key_1',
         'instance_id': 'instance_id_1',
         'ec2_region': 'ec2_region_1',
         'ec2_amis': ['ec2_ami_1'],
         'ec2_keypair': 'ec2_keypair_1',
         'ec2_secgroups': ['ec2_secgroup_1'],
         'ec2_instancetype': 'ec2_instancetype_1',
     }
     expected_mineserver = {
         'ssh_key_file_path': 'ssh_key_file_path_2',
         'memory_allocation': 'memory_allocation_1',
     }
     expected_web_client = {'server_password': '******'}
     expected_json = {
         'aws': [{'config': expected_aws}],
         'mineserver': [{'config': expected_mineserver}],
         'web_client': [{'config': expected_web_client}],
     }
     expected_sorted = self._sort_dict(expected_json)
     actual_sorted = self._sort_dict(modified_configuration.to_json())
     self.assertDictEqual(expected_sorted, actual_sorted)
Exemplo n.º 21
0
Arquivo: main.py Projeto: drkostas/HGN
def setup() -> Tuple[Dict, Dict, Dict, Dict, str]:
    """Set up logging and return the run's configuration sections.

    Console logging is configured first. After the configuration is loaded,
    ``_setup_log`` is called with a log file path derived from the input
    name and the run options.

    :return: ``(spark_config, input_config, run_options_config,
        output_config, modified_graph_name)``
    """

    args = _argparser()
    # Temporary logging
    console_level = logging.DEBUG if args.debug else logging.INFO
    # noinspection PyArgumentList
    logging.basicConfig(
        level=console_level,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[logging.StreamHandler()])

    # Load the configuration; only the first entry of each section is used
    config = Configuration(config_src=args.config_file)
    spark_config = config.get_spark_configs()[0]
    input_config = config.get_input_configs()[0]
    run_options_config = config.get_run_options_configs()[0]
    output_config = config.get_output_configs()[0]

    # Identifier that encodes the run options. Features are truncated to
    # 10 characters each and the first feature is left out.
    name_template = ("featMinAvg-{featMinAvg}_rLvl1-{rLvl1}_"
                     "rLvl2-{rLvl2}_betwThres-{betwThres}_feats-{feats}")
    options_id_name = name_template.format(
        featMinAvg=run_options_config['feature_min_avg'],
        rLvl1=run_options_config['r_lvl1_thres'],
        rLvl2=run_options_config['r_lvl2_thres'],
        betwThres=run_options_config['betweenness_thres'],
        feats=''.join([feat[:10] for feat in run_options_config['features_to_check'][1:]]))
    modified_graph_name = os.path.join(input_config['name'], options_id_name)

    log_file_path = os.path.join(output_config['logs_folder'],
                                 modified_graph_name + '.log')
    _setup_log(log_file_path, debug=args.debug)
    return spark_config, input_config, run_options_config, output_config, modified_graph_name
Exemplo n.º 22
0
 def parse(json):
     """Build a ``Configuration`` from the decoded *json* mapping.

     The values of ``json["input"]`` and ``json["predict_input"]`` are
     stacked into arrays and transposed. The layer list starts with a
     ``"tanh"`` layer sized by the first dimension of the training input,
     followed by one ``Layer`` per entry of ``json["layers"]``.
     """
     training_input = np.array(list(json["input"].values())).transpose()
     predict_input = np.array(list(json["predict_input"].values())).transpose()
     # Positional arguments, in the order Configuration is called with
     arguments = (
         training_input,
         np.array(json["output"]),
         [Layer(training_input.shape[0], "tanh")]
         + [Layer(spec["neurons"], spec["activation"]) for spec in json["layers"]],
         int(json["iterations"]),
         float(json["learning_rate"]),
         predict_input,
         np.array(json["predict_output"]),
         float(json["predict_threshold"]),
     )
     return Configuration(*arguments)
Exemplo n.º 23
0
class WiktionaryParser:
    """Stream a Wiktionary XML dump through a language-specific text parser.

    The concrete parser is chosen from the language reported by
    ``Configuration.get_parser()``.
    """

    def __init__(self, configuration_path):
        """Load the configuration at *configuration_path* and pick the parser."""
        self._configuration = Configuration(configuration_path)
        self._parser = self.__create_parser(self._configuration)

    @staticmethod
    def __create_parser(configuration: Configuration):
        """Return the parser for the configured language.

        Returns ``None`` when the language is neither ``Languages.ENGLISH``
        nor ``Languages.POLISH``.
        """
        parser = None
        language = configuration.get_parser()
        if language == Languages.ENGLISH:
            parser = EnglishWiktionaryParser(configuration)
        elif language == Languages.POLISH:
            parser = PolishWiktionaryParser(configuration)
        return parser

    def parse(self):
        """Parse the configured Wiktionary dump.

        For each text element with non-empty content, ``parse_text`` of the
        selected parser is called with the text, the most recent page title
        and the results collected so far. Truthy results are stored under
        that title.

        :return: dict mapping page title to the parser's result
        """
        logging.info('Start parser')
        start_time = time.time()
        parser_result = {}
        last_page_title = None
        wiktionary_path = self._configuration.get_wiktionary_path()
        # Only 'end' events are used: at a 'start' event the element's text
        # is not guaranteed to be populated, so reading it (or clearing the
        # element) there could lose titles or page text.
        for _event, elem in ET.iterparse(wiktionary_path, events=('end',)):
            tag = self.__get_tag(elem)
            if tag == _TITLE:
                last_page_title = elem.text
            elif tag == _TEXT and elem.text:
                result = self._parser.parse_text(elem.text, last_page_title, parser_result)
                if result:
                    parser_result[last_page_title] = result
            # Release the finished element's content to keep memory bounded
            elem.clear()
        end_time = time.time()
        logging.info('Parsing time: {} s'.format(end_time - start_time))
        return parser_result

    @staticmethod
    def __get_tag(elem):
        """Return the element's tag name without its ``{namespace}`` prefix.

        Tags that carry no namespace are returned unchanged.
        """
        return elem.tag.rpartition('}')[2]
Exemplo n.º 24
0
def main():
    """
    Handles the core flow of SpotiClick.

    Reads the current playback info with the first Spotify config and starts
    playback on the target device of the second one.

    :Example:
    python main.py -c confs/raspotify_conf.yml
                   -l logs/spoticlick.log
    """

    # Initializing
    cli_args = _argparser()
    _setup_log(cli_args.log, cli_args.debug)
    # Load the configuration
    configuration = Configuration(config_src=cli_args.config_file)
    # Init Spotipy: entry 0 is used for reading, entry 1 for modifying
    read_settings = configuration.get_spotifies()[0]
    modify_settings = configuration.get_spotifies()[1]
    target_device_id = modify_settings["target_device_id"]
    reader = Spotipy(config=read_settings, token_id='read')
    modifier = Spotipy(config=modify_settings, token_id='modify')
    logger.info("Transferring music to device id: %s" % target_device_id)
    current_session = reader.get_playback_info()
    modifier.play_on_device(target_device_id=target_device_id,
                            session_info=current_session)
    logger.info("Music Transferred!")
Exemplo n.º 25
0
def main():
    """
    Handles the core flow of SpotiClick.

    Reads the current volume with the first Spotify config and updates it in
    ``args.volume_direction`` with the second one.

    :Example:
    python main.py -c confs/raspotify_conf.yml
                   -l logs/spoticlick.log
    """

    # Initializing
    cli_args = _argparser()
    _setup_log(cli_args.log, cli_args.debug)
    # Load the configuration
    configuration = Configuration(config_src=cli_args.config_file)
    # Init Spotipy: entry 0 is used for reading, entry 1 for modifying
    read_settings = configuration.get_spotifies()[0]
    modify_settings = configuration.get_spotifies()[1]
    reader = Spotipy(config=read_settings, token_id='read')
    modifier = Spotipy(config=modify_settings, token_id='modify')
    if cli_args.volume_direction == 'increase':
        action_label = "Increasing"
    else:
        action_label = "Decreasing"
    logger.debug("%s volume by 5%%.." % action_label)
    volume_now = reader.get_current_volume()
    modifier.volume_update(direction=cli_args.volume_direction,
                           current_volume=volume_now)
    logger.debug("Volume changed!")
Exemplo n.º 26
0
 def __init__(self, configuration: Configuration, handler: Handler = None):
     """Initialize the base class with the configured server address and
     open the Mongo collection named by ``mongo.collection.drs``.

     :param configuration: source of the ``server.*``, ``mongo.*`` and
         ``dr.path`` properties
     :param handler: passed through to the base class initializer
     """
     server_address = (configuration.get_property("server.host"),
                       configuration.get_property("server.port"))
     super().__init__(server_address, handler)
     self.file_searcher = FileSearcher(True, False)
     self.running = False
     # The Mongo port is converted to int before it reaches the client
     mongo_client = MongoClient(configuration.get_property("mongo.host"),
                                int(configuration.get_property("mongo.port")))
     database = mongo_client.get_database(
         configuration.get_property("mongo.database"))
     self.dr_collection = database.get_collection(
         configuration.get_property("mongo.collection.drs"))
     self.dr_collection.create_index("messageId")
     self.dr_path = configuration.get_property("dr.path")
Exemplo n.º 27
0
def main(debug=False):
    """Run every generator in sequence with one shared ``Configuration``.

    :param debug: accepted but not used in this function
    """
    configuration = Configuration()

    # Each step builds its generator and runs it before the next starts
    FolderGenerator(configuration).make_folder_structure()
    PackageJsonGenerator(configuration).generate_package_json()
    ExtensionGenerator(configuration).generate_extension_js()
    ColoringVSCode(configuration).do_coloring_for_vscode()
    OutlineVSCode(configuration).do_outline_for_vscode()
Exemplo n.º 28
0
    def save(self):
        """Insert this question into ``questions`` unless its sentence exists.

        The row holds the sentence, the keyword, the choices joined with
        commas, and a trailing ``0``. The database file is taken from
        ``Configuration().db_file``.
        """
        config = Configuration()

        # get data ready to insert
        sentence = self.sentence
        keyword = self.keyword
        choices = ",".join(self.choices)

        connection = sqlite.connect(config.db_file)
        try:
            # NOTE(review): assumes `sqlite` is the stdlib sqlite3 module,
            # whose connection context manager commits on success and rolls
            # back on error but does NOT close the connection - hence the
            # explicit close() below.
            with connection:
                # search for current sentence in database
                cursor = connection.execute(
                    "select rowid from questions where sentence=?",
                    (sentence, ))
                # if current sentence doesn't exist in database, insert it
                if cursor.fetchone() is None:
                    connection.execute(
                        "insert into questions values (?,?,?,0)",
                        (sentence, keyword, choices))
        finally:
            connection.close()
    def edit_config(self, subfolder):
        """Point both testing dataset paths of the config file at *subfolder*.

        ``TESTING_DATASET_PATH`` is set to the inkml location plus
        *subfolder*, ``TESTING_DATASET_LG_PATH`` to the lg location plus
        *subfolder* plus ``'_lg'``. Each value is written back to
        ``configs/full_system_infty.conf`` under ``dprl_project_location``.
        """
        lg_base = '../Data/Expressions/lg_output/'
        inkml_base = '../Data/Expressions/inkml/'
        config_location = self.dprl_project_location + 'configs/full_system_infty.conf'
        config = Configuration.from_file(config_location)

        # inkml: read the current value (result unused), set, re-read, persist
        inkml_key = 'TESTING_DATASET_PATH'
        config.get_str(inkml_key)
        config.set(inkml_key, inkml_base + subfolder)
        new_inkml_path = config.get_str(inkml_key)
        config.write_to_file(config_location, inkml_key, new_inkml_path)

        # lg: same sequence for the label-graph path
        lg_key = 'TESTING_DATASET_LG_PATH'
        config.get_str(lg_key)
        config.set(lg_key, lg_base + subfolder + '_lg')
        new_lg_path = config.get_str(lg_key)
        config.write_to_file(config_location, lg_key, new_lg_path)
Exemplo n.º 30
0
 def __init__(self):
     """Print an empty line and create the default ``Configuration``."""
     print()
     default_config = Configuration()
     self.config = default_config
Exemplo n.º 31
0
#!/usr/bin/python3.5

import logging

from configuration.configuration import Configuration
from messaging.videoconversionmessaging import VideoConversionMessaging
from database.mongodb.videoconversion import VideoConversion
from videoconvunixsocket.videoconversionunixsocket import VideoConversionUnixSocket

# Script entry point: configure logging, start the unix socket, then build
# the conversion service and messaging objects and hand the messaging object
# to the socket.
if __name__ == '__main__':

    logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    configuration = Configuration()

    # Disabled debug output of individual configuration values:
    #logging.info(configuration.get_rabbitmq_host())
    #logging.info(configuration.get_rabbitmq_port())
    #logging.info(configuration.get_messaging_conversion_queue())
    #logging.info(configuration.get_database_name())
    #logging.info(configuration.get_video_conversion_collection())

    # The socket is started before the messaging object exists; the
    # messaging object is attached to it in the last statement.
    video_unix_socket = VideoConversionUnixSocket()
    video_unix_socket.start()
    video_conversion_service = VideoConversion(configuration)
    video_messaging = VideoConversionMessaging(configuration,
                                               video_conversion_service)
    video_unix_socket.setVideoConversionMessaging(video_messaging)
Exemplo n.º 32
0
class WikiCorpus(object):

    """Class representing corpus from Wikipedia of one language """

    # configuration file
    CORPUS_CONFIG_PATH = project_path('wikicorpus/corpus-config.yaml')

    # original dump file name
    DUMP_ORIGINAL_NAME = 'pages-articles.xml.bz2'

    # dump url
    DUMP_URL_GENERAL = 'http://dumps.wikimedia.org/{lang}wiki/latest/'\
        + '{lang}wiki-latest-' + DUMP_ORIGINAL_NAME

    # md5 checksum file url
    MD5_URL_GENERAL = 'http://dumps.wikimedia.org/{lang}wiki/latest/'\
        + '{lang}wiki-latest-md5sums.txt'

    # Wikipedia namespace number label for articles
    ARTICLE_NS = '0'

    def __init__(self, language):
        """Initialize a WikiCorpus instance.

        :language: unicode
        """
        # TODO: check if language is in dictionary of iso codes
        self._language = language

        # The configuration is read from the class-level config path.
        config_path = WikiCorpus.CORPUS_CONFIG_PATH
        self._configuration = Configuration(config_path)

        # vertical info
        #self._tagset = None
        #self._structures = None  # always _BASIC_STRUCTURES

    # ------------------------------------------------------------------------
    # getters and setters
    # ------------------------------------------------------------------------

    def get_corpus_name(self):
        """Return the configured ``corpus-name`` template with ``lang``
        filled in with the corpus language.
        """
        name_template = self._configuration.get('corpus-name')
        return name_template.format(lang=self.language())

    def get_dump_path(self):
        """Return the path of the dump file inside the uncompiled corpus
        directory.
        """
        # full dumps are bzipped, while sample dumps are uncompressed
        if self.is_dump_compressed():
            extension_key = 'compressed-dump'
        else:
            extension_key = 'uncompressed-dump'
        ext = self._configuration.get('extensions', extension_key)

        # dump file name = corpus name + extension
        dump_file_name = '{name}.{ext}'.format(name=self.get_corpus_name(),
                                               ext=ext)

        # path = path to verticals + dump file name
        return os.path.join(self.get_uncompiled_corpus_path(), dump_file_name)

    def get_dump_length(self):
        """Return the length of the dump.

        For uncompressed dumps this is the file size on disk.

        Note: For compressed dumps, the length would be larger than the
        file size; computing it is not supported.

        :raises NotImplementedError: when the dump is compressed
        """
        if self.is_dump_compressed():
            # NotImplementedError is the exception class; `NotImplemented`
            # is a non-callable constant, so calling it raised TypeError.
            raise NotImplementedError('calculated uncompressed dump length is not supported')
        return os.path.getsize(self.get_dump_path())

    def get_namespace(self):
        """Return the default (prefix-less) namespace of the wiki dump.

        Only the first parse event of the dump is read.
        """
        with self._open_dump() as dump_file:
            # read first event, which is ('start', root element)
            context_for_ns = etree.iterparse(dump_file, events=('start',))
            # builtin next() works on Python 2.6+ and Python 3, whereas the
            # iterator's .next() method exists on Python 2 only
            _, root = next(context_for_ns)
            # get namespace information from the root element,
            # None means implicit namespace (without prefix)
            namespace = root.nsmap[None]
            del context_for_ns
        return namespace

    def get_prevertical_path(self):
        """Return the path of the prevertical file inside the uncompiled
        corpus directory.
        """
        # prevertical file name = corpus name + extension
        corpus_name = self.get_corpus_name()
        extension = self._configuration.get('extensions', 'prevertical')
        prevertical_file_name = '{name}.{ext}'.format(name=corpus_name,
                                                      ext=extension)

        # path = path to verticals + prevertical file name
        return os.path.join(self.get_uncompiled_corpus_path(),
                            prevertical_file_name)

    def get_registry_path(self):
        """Return the path of the registry file.

        ``makedirs`` is called on the registry directory first.
        """
        registry_dir = environment.registry_path()
        makedirs(registry_dir)
        return os.path.join(registry_dir, self.get_corpus_name())

    #def get_tagset(self):
    #    """Returns tagset of the corpus.

    #    @return: [registry.tagsets.tagset] || None
    #    @throws: RegistryException
    #    """
    #    # first, if _tagset is None, update the tagset information
    #    if self._tagset is None:
    #        self._tagset = get_registry_tagset(self.get_registry_path())
    #    return self._tagset

    def get_url_prefix(self):
        """Return the URL prefix shared by all articles in the corpus."""
        prefix_template = 'http://{lang}.wikipedia.org/wiki'
        return prefix_template.format(lang=self.language())

    def get_vertical_path(self):
        """Return the path of the vertical file inside the uncompiled
        corpus directory.
        """
        # vertical file name = corpus name + extension
        corpus_name = self.get_corpus_name()
        extension = self._configuration.get('extensions', 'vertical')
        vertical_file_name = '{name}.{ext}'.format(name=corpus_name,
                                                   ext=extension)

        # path = path to verticals + vertical file name
        return os.path.join(self.get_uncompiled_corpus_path(),
                            vertical_file_name)

    def get_uncompiled_corpus_path(self):
        """Return the directory that holds the verticals of this corpus.

        ``makedirs`` is called on the directory before it is returned.
        """
        verticals_root = environment.verticals_path()
        corpus_dir = os.path.join(verticals_root, self.get_corpus_name())
        makedirs(corpus_dir)
        return corpus_dir

    def get_compiled_corpus_path(self):
        """Return the directory that holds the compiled corpus.

        ``makedirs`` is called on the directory before it is returned.
        """
        compiled_root = environment.compiled_corpora_path()
        corpus_dir = os.path.join(compiled_root, self.get_corpus_name())
        makedirs(corpus_dir)
        return corpus_dir

    #def is_sample(self):
    #    """ Returns True if this is a sample corpus
    #    """
    #    return bool(self.sample_size())

    def is_dump_compressed(self):
        """Return True if the dump is compressed, False otherwise.

        This implementation always returns True.
        """
        # dumps for full languages are always compressed
        return True

    def language(self):
        """Return the corpus language stored in ``self._language``."""
        return self._language

    def prevertical_file_exists(self):
        """Return True if the prevertical path exists on disk."""
        prevertical_path = self.get_prevertical_path()
        return os.path.exists(prevertical_path)

    def vertical_file_exists(self):
        """Return True if the vertical path exists on disk."""
        vertical_path = self.get_vertical_path()
        return os.path.exists(vertical_path)

    # ------------------------------------------------------------------------
    #  corpus building methods
    # ------------------------------------------------------------------------

    def download_dump(self, force=False):
        """Download the Wikipedia dump to the dump path.

        The MD5 checksum listed for the dump file is looked up first and
        handed to the downloader; a warning is logged when none is found.

        :force: Boolean
            if True, the dump is downloaded even if a file already
            exists at the target path
        """
        # select dump path
        dump_path = self.get_dump_path()
        already_downloaded = os.path.exists(dump_path)
        if already_downloaded and not force:
            logging.info('Dump {name} already exists.'.format(name=dump_path))
            return

        # select dump url
        dump_url = WikiCorpus.DUMP_URL_GENERAL.format(lang=self.language())

        start_message = 'Started downloading {l}-wiki dump from {url}'
        logging.info(start_message.format(l=self.language(), url=dump_url))

        # find MD5 checksum: first listed file whose name ends with the
        # original dump name
        md5_url = WikiCorpus.MD5_URL_GENERAL.format(lang=self.language())
        md5sums = get_online_file(md5_url, lines=True)
        md5sum = None
        for file_md5, file_name in map(lambda x: x.split(), md5sums):
            if file_name.endswith(WikiCorpus.DUMP_ORIGINAL_NAME):
                md5sum = file_md5
                break
        if md5sum is None:
            logging.warning('no matching MD5 checksum for the dump found')

        # downloading
        download_large_file(dump_url, dump_path, md5sum=md5sum)

        finish_message = 'Downloading of {lang}-wiki dump finished'
        logging.info(finish_message.format(lang=self.language()))

    def create_prevertical(self):
        """ Parses dump (outer XML, inner Wiki Markup) and creates prevertical

        The dump is streamed with etree.iterparse over 'end' events, so the
        whole XML tree is never kept in memory. For every page the title is
        remembered when its <title> element ends; when the <text> element
        ends, the page is either dropped (it was flagged as a redirect or as
        a non-article namespace, or it has no text/title) or converted by
        parse_wikimarkup and written, UTF-8 encoded, to the prevertical file.
        Written documents get sequential ids starting at 1.
        """
        prevertical_path = self.get_prevertical_path()
        namespace = self.get_namespace()

        # create qualified names (= names with namespaces) for tags we need
        TEXT_TAG = qualified_name('text', namespace)
        TITLE_TAG = qualified_name('title', namespace)
        REDIRECT_TAG = qualified_name('redirect', namespace)
        NS_TAG = qualified_name('ns', namespace)

        logging.info('Preverticalization of {name} started...'.format(
            name=self.get_corpus_name()))

        # iterate through xml and build a sample file
        with open(prevertical_path, 'w') as prevertical_file:
            with self._open_dump() as dump_file:
                context = etree.iterparse(dump_file, events=('end',))
                #progressbar = ProgressBar(self.get_dump_length())
                last_title = None
                id_number = 0
                # skip first page in full (compressed) dump since it's Main Page
                skip = True if self.is_dump_compressed() else False

                # iterate through end-events
                for event, elem in context:
                    if elem.tag == REDIRECT_TAG:
                        # ignore redirect pages
                        skip = True
                    elif elem.tag == NS_TAG:
                        # ignore nonarticle pages (such as "Help:" etc.)
                        if elem.text != WikiCorpus.ARTICLE_NS:
                            skip = True
                    elif elem.tag == TITLE_TAG:
                        # remember the title
                        last_title = elem.text
                    elif elem.tag == TEXT_TAG:
                        # the skip flag applies to exactly one page: it is
                        # consumed (reset) by that page's text element
                        if skip:
                            skip = False
                            continue
                        # pages without text or title are dropped and do not
                        # consume an id
                        if not elem.text or not last_title:
                            continue
                        # new id
                        id_number += 1
                        parsed_doc = parse_wikimarkup(id_number, last_title,
                            self.get_url_prefix(), elem.text) + '\n'
                        prevertical_file.write(parsed_doc.encode('utf-8'))
                        # approximate work done by position in dump file
                        #progressbar.update(dump_file.tell())

                    # cleanup: drop the content of the finished element and
                    # delete preceding siblings of the element and of all its
                    # ancestors, so the partially built tree does not grow
                    # (note: the `continue` statements above bypass this
                    # cleanup for that one event)
                    elem.clear()
                    #while elem.getprevious() is not None:
                    #    del elem.getparent()[0]
                    for ancestor in elem.xpath('ancestor-or-self::*'):
                        while ancestor.getprevious() is not None:
                            del ancestor.getparent()[0]
                del context
        #progressbar.finish()

        logging.info('Prevertical of {name} created at: {path}'.format(
            name=self.get_corpus_name(), path=prevertical_path))

    def create_vertical(self):
        """ Creates a vertical file.

        Performs tokenization of the prevertical and for some languages
        also morphological annotation (adding morphological tag and
        lemma/lempos). The registry file is created as part of the process.

        NOTE: Because of a bug in TreeTagger an ugly hack is necessary
        (applied for English only):
          1) perform the following substitution in the prevertical:
                </term>     --->  __TERM_END__
          2) let TreeTagger create the vertical
          3) move <term> and </term> to the right place using the inserted
                __TERM_END__ mark

        Raises CorpusException when the prevertical file is missing or when
        ConfigurationException / LanguageProcessorException is raised
        during processing.
        """
        source_path = self.get_prevertical_path()
        marked_source_path = source_path + '.tmp'
        target_path = self.get_vertical_path()
        tmp_target_path = target_path + '.tmp'

        # the prevertical has to be there before anything else is attempted
        if not self.prevertical_file_exists():
            raise CorpusException('Verticalization failed: '
                + 'Missing prevertical file.')

        logging.info('Verticalization of {name} started...'.format(
            name=self.get_corpus_name()))
        try:
            if self.language() != 'en':
                # straightforward path: prevertical -> vertical, then registry
                with NaturalLanguageProcessor(self.language()) as processor:
                    processor.create_vertical_file(source_path, target_path)
                self.create_registry()
            else:
                # TreeTagger bug workaround, step 1: mark ends of terms
                self._mark_terms(source_path, marked_source_path)
                # step 2: create a temporary vertical from the marked file
                with NaturalLanguageProcessor(self.language()) as processor:
                    processor.create_vertical_file(marked_source_path,
                        tmp_target_path)
                # create registry file
                self.create_registry()
                # TreeTagger bug workaround, step 3: put the term tags back
                # to the right place and drop both temporary files
                self._correct_terms(tmp_target_path, target_path)
                call(('rm', marked_source_path, tmp_target_path))

            logging.info('Vertical of {name} created at: {path}'.format(
                name=self.get_corpus_name(),
                path=target_path))
        except (ConfigurationException, LanguageProcessorException) as failure:
            raise CorpusException('Verticalization failed: ' + failure.message)

    def _mark_terms(self, prevert_path, marked_prevert_path):
        cmd = "sed 's/<\/term>/ __TERM_END__/g' {fr} > {to}".format(
            fr=prevert_path, to=marked_prevert_path)
        task = Popen(cmd, shell=True)
        task.wait()
        if task.returncode != 0:
            raise CorpusException('sed error')

    def _correct_terms(self, input_path, output_path):
        last_term_line = None
        open_term = False
        #state = 0  # = pocet radku spatne posunuteho termu
        with open(input_path) as input_file:
            with open(output_path, 'w') as output_file:
                for encoded_line in input_file:
                    line = encoded_line.decode('utf-8')

                    if line.startswith('<term '):
                        last_term_line = encoded_line
                    elif line.startswith('</term>'):
                            # ignore
                            continue
                    elif line.startswith('<s>'):
                        output_file.write(encoded_line)
                        if last_term_line:
                            output_file.write(last_term_line)
                            last_term_line = None
                            open_term = True
                    elif line.startswith('__TERM_END__') and open_term:
                        output_file.write(str('</term>\n'))
                        open_term = False
                    elif line.startswith('<'):
                        output_file.write(encoded_line)
                    else:
                        if last_term_line:
                            output_file.write(last_term_line)
                            last_term_line = None
                            open_term = True
                        output_file.write(encoded_line)

                    # if state == 0 and line.startswith('<term '):
                    #     last_term_line = encoded_line
                    #     state += 1
                    # elif state == 1 and line.startswith('</term>'):
                    #     state += 1
                    # elif line.startswith('</term>'):
                    #     # ignore
                    #     continue
                    # elif state == 1 and line.startswith('__TERM_END__'):
                    #     # empty term
                    #     state = 0
                    #     last_term_line = None
                    # elif state == 2 and line.startswith('<s>'):
                    #     output_file.write(encoded_line)
                    #     output_file.write(last_term_line)
                    #     last_term_line = None
                    #     state = 0
                    # elif state == 2 and line.startswith('<'):
                    #     output_file.write(encoded_line)
                    # elif line.startswith('__TERM_END__'):
                    #     output_file.write(str('</term>\n'))
                    # else:
                    #     if last_term_line:
                    #         output_file.write(last_term_line)
                    #         last_term_line = None
                    #     state = 0
                    #     output_file.write(encoded_line)

    def infere_terms_occurences(self):
        """ Labels all occurences of terms in morfolgized vertical

        During terms-inference some postprocessing is done as well
        (removing desamb hacks, using actual numbers as lemmata).

        The vertical is read document by document (from a line starting with
        '<doc' up to the '</doc>' line); each document is passed to
        VerticalDocument with terms_inference=True and its string form is
        written to '<vertical path>.terms'. Lines outside of any document
        are ignored.

        Raises CorpusException for a non-English corpus or when the
        inference fails.
        """
        if self.language() != 'en':
            raise CorpusException('terms inference is currently supported only for English')

        vertical_path = self.get_vertical_path()

        try:
            logging.info('Terms occurences inference in {name} started'.format(
                name=self.get_corpus_name()))

            output_path = vertical_path + '.terms'

            # the tagset is fixed here, it is not read from the registry file
            tagset = TAGSETS.TREETAGGER

            with open(vertical_path) as input_file:
                with open(output_path, 'w') as output_file:
                    # lines of the document being collected;
                    # None = currently outside of any document
                    document = None
                    for line in input_file:
                        line = line.decode('utf-8').strip()
                        # TODO: handle empty lines and similar things?
                        if line.startswith('<doc'):
                            document = [line]
                        elif document is not None:
                            document.append(line)
                        else:
                            # stray line before the first / between documents
                            continue
                        # check if the end of document is reached
                        if line == '</doc>':
                            vertical = VerticalDocument(document,
                                tagset=tagset,
                                terms_inference=True)
                            output_file.write(str(vertical))
                            # release the finished document
                            document = None

            logging.info('Terms occurences inference in {name} finished.'
                .format(name=self.get_corpus_name()))

        except (CorpusException, RegistryException,
                LanguageProcessorException) as exc:
            raise CorpusException('Terms inference failed: ' + exc.message)

    def create_registry(self):
        """ Creates registry file

        Passes the registry path, language, vertical path and compiled
        corpus path of this corpus to store_registry.
        """
        registry_path = self.get_registry_path()
        language = self.language()
        vertical_path = self.get_vertical_path()
        compiled_path = self.get_compiled_corpus_path()
        store_registry(path=registry_path, lang=language,
            vertical_path=vertical_path, compiled_path=compiled_path)

    def compile_corpus(self):
        """ Compiles the corpus

        Runs `compilecorp --recompile-corpus <registry> <vertical>` and waits
        for it to finish. Raises CorpusException on a nonzero exit code.
        """
        command = (
            'compilecorp',
            '--recompile-corpus',
            self.get_registry_path(),
            self.get_vertical_path(),
        )
        exit_code = Popen(command).wait()
        if exit_code != 0:
            raise CorpusException('Compilation failed.')
        logging.info('Corpus compiled:' + self.get_compiled_corpus_path())

    def check_corpus(self):
        """ Prints compiled corpus status generated by corpcheck.

        Runs `corpcheck <registry>` and waits for it to finish.
        Raises CorpusException on a nonzero exit code.
        """
        command = ('corpcheck', self.get_registry_path())
        exit_code = Popen(command).wait()
        if exit_code != 0:
            raise CorpusException('Compiled corpus checking failed.')

    def print_concordances(self, query):
        """ Prints concordances of a given query

        Runs corpquery on the registry of this corpus; the exit code of the
        command is not checked.

        @param query: query string in CQL
        """
        display_options = (
            '-h', '10',     # limit of 10 results
            '-c', '10',     # left and right context of 10 words
            '-a', 'word',   # only show words in the result
            '-s', 'p,doc',  # only show p and doc structures
        )
        command = ('corpquery', self.get_registry_path(), query)
        call(command + display_options)

    def print_info(self):
        """ Returns corpus summary
        """
        # check if corpus exists
        verticals_path = self.get_uncompiled_corpus_path()
        if not os.listdir(verticals_path):
            print 'Corpus %s does not exist.' % self.get_corpus_name()
            return

        print 'Corpus name:', self.get_corpus_name()

        # verticals
        print 'Vertical files:', verticals_path
        call(('ls', '-lhtcr', verticals_path))

        # registry
        registry_path = self.get_registry_path()
        if os.path.isfile(registry_path):
            print 'Registry:', registry_path
        else:
            print 'Registry: no'
            # if there is no registry file, it can't be compiled
            return

        # compilation
        compiled_path = self.get_compiled_corpus_path()
        print 'Compiled:',
        if os.listdir(compiled_path):
            print compiled_path
        else:
            print 'no'

    # ------------------------------------------------------------------------
    #  private methods
    # ------------------------------------------------------------------------

    @contextmanager
    def _open_dump(self):
        """Opened dump (prepared for reading) with statement manager

        Allows to write:
            with self._open_dump() as dump_file:
                do something
        And dump will be closed automatically no matter what.
        """
        dump_path = self.get_dump_path()
        try:
            # open dump
            if self.is_dump_compressed():
                dump_file = bz2.BZ2File(dump_path, 'r')
            else:
                dump_file = open(dump_path)
            try:
                yield dump_file
                # [after yield, the body of with statement will be executed]
            finally:
                dump_file.close()
        except IOError as exc:
            # errno.ENOENT = "No such file or directory"
            if exc.errno == errno.ENOENT:
                raise CorpusException('Dump file {name} doesn\'t exist.'
                    .format(name=dump_path))

    # ------------------------------------------------------------------------
    #  magic methods
    # ------------------------------------------------------------------------

    def __str__(self):
        return unicode(self).encode('utf-8')

    def __repr__(self):
        return 'WikiCorpus({lang})'.format(lang=self.language())

    def __unicode__(self):
        return repr(self)