def download(dataset='uea'):
    """ Downloads the requested archive ('uea', 'ucr' or 'tsr') to 'data/raw/<NAME>'. """
    raw_dir = DATA_DIR + '/raw'
    assert os.path.isdir(raw_dir), "No directory exists at data/raw. Please make one to continue."

    if dataset == 'uea':
        url = 'http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_arff.zip'
        save_dir = DATA_DIR + '/raw/UEA'
        zipname = save_dir + '/uea.zip'
    elif dataset == 'ucr':
        url = 'http://www.timeseriesclassification.com/Downloads/Archives/Univariate2018_arff.zip'
        save_dir = DATA_DIR + '/raw/UCR'
        zipname = save_dir + '/ucr.zip'
    elif dataset == 'tsr':
        url = 'https://zenodo.org/record/3902651/files/Monash_UEA_UCR_Regression_Archive.zip?download=1'
        save_dir = DATA_DIR + '/raw/TSR'
        zipname = save_dir + '/tsr.zip'
    else:
        raise ValueError('Can only download uea, ucr or tsr. Was asked for {}.'.format(dataset))

    if os.path.exists(save_dir):
        print('Path already exists at {}. If you wish to re-download you must delete this folder.'.format(save_dir))
        return

    mkdir_if_not_exists(save_dir)
    if len(os.listdir(save_dir)) == 0:
        download_url(url, zipname)
        unzip(zipname, save_dir)
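# A minimal usage sketch (hypothetical; assumes DATA_DIR and the helpers
# mkdir_if_not_exists, download_url and unzip are defined elsewhere in this repo):
#
#     download('uea')   # fetch the multivariate UEA archive into DATA_DIR/raw/UEA
#     download('tsr')   # fetch the Monash/UEA/UCR regression archive into DATA_DIR/raw/TSR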
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="ptwiki-compressed-text-folder")
    parser.add_argument("-o", "--output",
                        default="./data/cleaned/",
                        help="directory for extracted files")
    args = parser.parse_args()

    input_dirname = args.input
    output_dirname = args.output
    mkdir_if_not_exists(output_dirname)

    vocab, tokens = set(), 0
    output = OutputSplitter(NextFile(output_dirname), 10 * 1024 * 1024, True)

    num_threads = multiprocessing.cpu_count()
    pool = ThreadPool(num_threads)
    print('Running with {0} threads ...'.format(num_threads))

    job_batch_size = 1000
    reporter = JobsReporter(report_period=1000)

    documents = read_wiki_documents_compressed(input_dirname)
    jobs = grouper(documents, job_batch_size)
    for job in pool.imap(worker_clean_document, jobs):
        for sentences in job:
            for sentence in sentences:
                output.write((sentence + '\n').encode('utf-8'))
                tokens += sentence.count(' ') + 1
                for w in sentence.split():
                    vocab.add(w)
        reporter.complete_job(report=True)
    output.close()

    print('\n')
    print('Tokens: ', tokens)
    print('Vocabulary: ', len(vocab))
def main():
    logging.basicConfig(format='%(levelname)s: %(message)s')
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="deps_context_path")
    parser.add_argument("-o", "--output",
                        default="./data/contexts/",
                        help="directory for extracted files")
    parser.add_argument(
        "-m", "--model",
        default="./data/models/word2vec/word2vec-s400-w5-m5.bin",
        help="word2vec model to extract vocab")
    args = parser.parse_args()

    output_dirname = args.output
    mkdir_if_not_exists(output_dirname)
    deps_context_path = args.input

    word_vectors = KeyedVectors.load_word2vec_format(args.model, binary=True)
    vocab = set(word_vectors.vocab)
    logging.info('Vocab:\t%d', len(vocab))

    extract_start = time.perf_counter()

    logging.info("Processing wv ...")
    filter_file(deps_context_path, output_dirname, 'wv', vocab)
    logging.info("Processing cv ...")
    filter_file(deps_context_path, output_dirname, 'cv', vocab)
    logging.info("Processing dep.contexts ...")
    filter_file(deps_context_path, output_dirname, 'dep.contexts', vocab)

    extract_duration = time.perf_counter() - extract_start
    logging.info("elapsed %f", extract_duration)
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="sqlfile_path")
    default_process_count = max(1, cpu_count() - 1)
    parser.add_argument("--processes",
                        type=int,
                        default=default_process_count,
                        help="Number of processes to use (default %(default)s)")
    parser.add_argument(
        "-b", "--batchsize",
        type=int,
        default=50,
        help="The number of sentences to be sent to the parser in each iteration.")
    parser.add_argument("-o", "--output",
                        default="./data/contexts/",
                        help="directory for extracted files")
    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet",
                        action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug",
                        action="store_true",
                        help="print debug info")
    args = parser.parse_args()

    output_dirname = args.output
    mkdir_if_not_exists(output_dirname)
    sqlfile_path = args.input
    job_batch_size = args.batchsize

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    options = {}
    options['quiet'] = args.quiet
    options['debug'] = args.debug
    options['sqlfile_path'] = sqlfile_path
    options['job_batch_size'] = job_batch_size
    options['output_dirname'] = output_dirname
    createLogger(options['quiet'], options['debug'])

    number_of_pages = get_page_count(sqlfile_path, job_batch_size)
    jobs = [(pageNum, ) for pageNum in range(1, number_of_pages)]

    # process pages
    logging.info("Starting")
    extract_start = time.perf_counter()

    process_count = args.processes
    process_count = max(1, process_count)
    maxsize = 10 * process_count

    # output queue
    output_queue = Queue(maxsize=maxsize)

    worker_count = process_count

    # load balancing
    max_spool_length = 10000
    spool_length = Value('i', 0, lock=False)

    # reduce job that sorts and prints output
    reduce = Process(target=reduce_process,
                     args=(options, output_queue, spool_length))
    reduce.start()

    # initialize jobs queue
    jobs_queue = Queue(maxsize=maxsize)

    # start worker processes
    logging.info("Using %d processes.", worker_count)
    workers = []
    for i in range(worker_count):
        extractor = Process(target=extract_process,
                            args=(options, i, jobs_queue, output_queue))
        extractor.daemon = True  # only live while parent process lives
        extractor.start()
        workers.append(extractor)

    # Mapper process
    page_num = 0
    for page_data in jobs:
        pageNum, = page_data
        # slow down
        delay = 0
        if spool_length.value > max_spool_length:
            # reduce to 10%
            while spool_length.value > max_spool_length / 10:
                time.sleep(10)
                delay += 10
        if delay:
            logging.info('Delay %ds', delay)
        job = (pageNum, )
        jobs_queue.put(job)  # goes to any available extract_process
        page_num += 1

    # signal termination
    for _ in workers:
        jobs_queue.put(None)
    # wait for workers to terminate
    for w in workers:
        w.join()

    # signal end of work to reduce process
    output_queue.put(None)
    # wait for it to finish
    reduce.join()

    extract_duration = time.perf_counter() - extract_start
    extract_rate = (page_num * job_batch_size) / extract_duration
    logging.info(
        "Finished %d-process of %d sentences in %.1fs (%.1f sentences/s)",
        process_count, page_num * job_batch_size, extract_duration,
        extract_rate)
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="ptwiki-compressed-text-folder")
    parser.add_argument("-o", "--output",
                        default="./models/",
                        help="directory for extracted files")
    parser.add_argument("-s", "--size", type=int, default=200, help="size")
    parser.add_argument("-w", "--window", type=int, default=5, help="window")
    parser.add_argument("-m", "--mincount", type=int, default=2, help="mincount")
    parser.add_argument("-sg", "--sg", type=int, default=0, help="use skip-gram")
    args = parser.parse_args()

    output_dirname = args.output
    input_dir = args.input
    size = args.size
    sg = args.sg
    window = args.window
    min_count = args.mincount
    output_path = args.output
    mkdir_if_not_exists(output_dirname)

    # e.g. '../data/ptwiki-articles-text-preprocessed'
    wiki_text_dump_path = input_dir + '/**/*.bz2'
    sentences = MySentences(wiki_text_dump_path)

    # build vocabulary and train model
    model = gensim.models.Word2Vec(sentences,
                                   size=size,
                                   window=window,
                                   min_count=min_count,
                                   sg=sg,
                                   workers=multiprocessing.cpu_count())
    # model.train(documents, total_examples=len(documents), epochs=10)

    # Trim unneeded model memory (uses much less RAM).
    # Precompute L2-normalized vectors. If replace is set, the original vectors are
    # discarded and only the normalized ones are kept, which saves a lot of memory.
    # Note that you cannot continue training after a replace: the model becomes
    # effectively read-only (most_similar, similarity etc. still work, but not train).
    model.init_sims(replace=True)

    model_file_name = os.path.join(
        output_path,
        'word2vec-s{0}-w{1}-m{2}-sg{3}'.format(size, window, min_count, sg))
    # model.save(model_file_name)
    model.wv.save_word2vec_format('{0}.bin'.format(model_file_name), binary=True)

    word_vectors = model.wv
    # word_vectors.save(output_path)
    print(word_vectors.most_similar(positive=['carro'], topn=10))
    print("Most similar to 'america': {0}".format(
        word_vectors.most_similar(positive="america")))
    print('CONCLUDED')
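# Sketch of how the exported vectors could be reloaded later. The path below is an
# example built from the naming scheme and default arguments above, not a file this
# script is guaranteed to have produced:
#
#     from gensim.models import KeyedVectors
#     wv = KeyedVectors.load_word2vec_format('./models/word2vec-s200-w5-m2-sg0.bin',
#                                             binary=True)
#     print(wv.most_similar(positive=['carro'], topn=10))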
def main():
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="sqlfile_path")
    parser.add_argument(
        "-b", "--batchsize",
        type=int,
        default=50,
        help="The number of sentences to be sent to the parser in each iteration.")
    parser.add_argument("-mc", "--mincount",
                        type=int,
                        help="ignores all contexts that appear less than this count")
    parser.add_argument(
        "-wv", "--wordvocabcount",
        type=int,
        help="generates the word vocab and ignores all words that appear less than this count")
    parser.add_argument("-o", "--output",
                        default="./data/contexts/",
                        help="directory for extracted files")
    args = parser.parse_args()

    sqlfile_path = args.input
    job_batch_size = args.batchsize
    output_dirname = args.output
    wordvocabcount = args.wordvocabcount
    mincount = args.mincount
    mkdir_if_not_exists(output_dirname)

    word_vocab_file = os.path.join(output_dirname, 'wordvocabcount')
    cv_all = Counter()
    wv_all = Counter()
    word_filter = None
    output = None
    if wordvocabcount is None:
        # second pass: load the previously generated word vocabulary and write contexts
        with open(word_vocab_file, 'rb') as f:
            word_filter = wv_all = pickle.load(f)
        output = open(os.path.join(output_dirname, 'dep.contexts'), 'wb')
    else:
        mincount = None

    num_threads = multiprocessing.cpu_count()
    pool = multiprocessing.pool.ThreadPool(num_threads)
    print('Running with {0} threads ...'.format(num_threads))
    print('Batch size: {0}'.format(job_batch_size))
    reporter = JobsReporter(batch_size=job_batch_size, report_period=10)

    with sqlite3.connect(sqlfile_path) as conn:
        c = conn.cursor()
        c.execute('SELECT COUNT(*) FROM sentences where palavras IS NOT NULL')
        (total, ) = c.fetchone()
    jobs_number = math.ceil(total / job_batch_size)
    jobs = range(1, jobs_number)
    print('Sentences to be parsed: {0}'.format(total))

    reporter.reset()
    for (cv, wv, batch_result) in pool.imap(
            create_worker_method(sqlfile_path, job_batch_size, word_filter),
            jobs):
        cv_all = cv_all + cv
        if wordvocabcount is not None:
            wv_all = wv_all + wv
        else:
            output.write('\n'.join(batch_result).encode('utf-8'))
        reporter.complete_job(report=True)
    print('\n')

    if wordvocabcount is not None:
        # drop words whose count falls below the threshold
        for key, count in dropwhile(
                lambda key_count: key_count[1] >= wordvocabcount,
                wv_all.most_common()):
            del wv_all[key]
        with open(word_vocab_file, 'wb') as f:
            pickle.dump(wv_all, f)
        with open(os.path.join(output_dirname, 'wv'),
                  encoding='utf-8', mode='w') as f:
            for w, count in wv_all.items():
                f.write('{} {}\n'.format(w, count))
        wv_all = None

    if wordvocabcount is None:
        # drop contexts whose count falls below the threshold
        for key, count in dropwhile(
                lambda key_count: key_count[1] >= mincount,
                cv_all.most_common()):
            del cv_all[key]
        with open(os.path.join(output_dirname, 'cv'),
                  encoding='utf-8', mode='w') as f:
            for w, count in cv_all.items():
                f.write('{} {}\n'.format(w, count))
        cv_all = None