def get_conf():
    """Build the run configuration from command-line arguments and YAML defaults.

    Command-line arguments are parsed first. The YAML file located via
    ``rel_path(__file__, 'conf', 'webcorpora.yaml')`` is then loaded, and
    each of its keys is copied onto the parsed namespace when the namespace
    either lacks that attribute or holds ``None`` for it. Consequently:

    * a value given explicitly on the command line is never overridden;
    * options whose argparse default is not ``None`` (``--output``,
      ``--threads``, and the ``store_true`` flags) are never overridden;
    * YAML keys with no command-line counterpart become new attributes.

    :returns: ``argparse.Namespace`` holding the merged configuration.
    """
    parser = ArgumentParser(description=__doc__,
                            formatter_class=RawTextHelpFormatter)
    parser.add_argument('-d', '--dict', required=True,
                        help='Path to dictionary with phrases')
    parser.add_argument('-o', '--output', default='web',
                        help='Mongo DB collection for saving corpora')
    parser.add_argument('--lang1', required=True,
                        help='Language of first phrase in dictionary')
    parser.add_argument('--lang2', required=True,
                        help='Language of second phrase in dictionary')
    parser.add_argument('-r', '--restart', action='store_true',
                        help='Restart retrieving text')
    # The two options below default to None so the YAML file can supply them.
    # They are parsed as numbers so that a value given on the command line
    # has a numeric type instead of arriving as a raw string.
    parser.add_argument('--min-phrase-probability', default=None, type=float,
                        help='Minimal probability value for phrase pair')
    parser.add_argument('--results-for-phrase', default=None, type=int,
                        help='Number of results to save for each phrase')
    parser.add_argument('-t', '--threads', default=5, type=int,
                        help='Number of threads for downloading html from websites')
    parser.add_argument('--debug', action='store_true', help='Debug mode')
    conf = parser.parse_args()

    yaml_file = rel_path(__file__, 'conf', 'webcorpora.yaml')
    with open(yaml_file) as f:
        # An empty YAML document loads as None; treat it as "no defaults".
        yaml_conf = yaml.safe_load(f) or {}

    # Fill in only what the command line left unset (missing or None).
    for key, val in yaml_conf.items():
        if getattr(conf, key, None) is None:
            setattr(conf, key, val)
    return conf
:phrase: TODO :returns: TODO """ log.debug('get_text_async') return list(get_text([phrase], lang1, lang2, n)) if __name__ == '__main__': conf = get_conf() if conf.debug: logging.getLogger().setLevel(logging.DEBUG) else: logging.getLogger('requests').setLevel(logging.ERROR) logging.getLogger('urllib3').setLevel(logging.ERROR) state = shelve.open(rel_path(__file__, 'conf', 'state.db')) log.debug('State %s', state) log.debug('Conf %s', conf) state_id = os.path.abspath(conf.dict) skip = state.setdefault(state_id, 0) if conf.restart: skip = 0 state[state_id] = 0 pool = Pool(conf.threads, init_async, (conf.lang1, conf.lang2, conf.results_for_phrase, conf.google_delay*conf.threads, conf.google_big_delay)) data_iter = read_bidict(conf.dict, conf.min_phrase_probability, skip=skip) data_iter = progress_updater(state, state_id, data_iter, skip)