def process_batches_for_lemma():
    # Build a lemma map for each of the 19 pickled token/entity/tag batches
    # and save every partial map to its own file.
    for i in range(0, 19):
        t = time()
        print(saved_data_file_tokens_entities_tags.format(i))
        data = get_pickled(saved_data_file_tokens_entities_tags.format(i))
        lemma_map = get_lemma_map(data)
        save_to_file("lemma_map-{}".format(i), lemma_map)
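# The partial maps saved above ("lemma_map-0" .. "lemma_map-18") can be folded
# into a single dictionary. A minimal merge sketch, assuming each partial map
# is a plain dict and that dict.update is an acceptable merge rule; the helper
# name merge_lemma_maps is hypothetical and not part of this codebase.
#
# def merge_lemma_maps(n_batches=19):
#     merged = {}
#     for i in range(n_batches):
#         merged.update(get_pickled("lemma_map-{}".format(i)))
#     return merged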
def process_batches(_i):
    # Validate one chunk of test tuples against the entity mappings and save
    # the (found, not_found, errors) triple for that chunk.
    # NOTE: this definition is shadowed by process_batches(sentences_out) further below.
    print("Now processing test_tuples_chunk_{}".format(_i))
    _test_tuples = get_pickled("chunks/test_tuples_chunk_{}".format(_i))
    _ply = len(_test_tuples)
    print("#test_tuples_chunk: {}".format(_ply))
    t = time()
    _found, _not_found, _errors = validate(_test_tuples, entity_tuples, pl_map,
                                           prefix_map, lemma_map, _ply)
    print('Time to process test_tuples-{} : {} mins'.format(
        _i, round((time() - t) / 60, 2)))
    save_to_file("results_chunks-{}".format(_i), (_found, _not_found, _errors))
def process_test_tuples(_i):
    print("Now processing file: {}".format(saved_data_file.format(_i)))
    _data = get_pickled(saved_data_file.format(_i))
    _test_tuples = get_test_data(_data)
    save_to_file("test_tuples-{}".format(_i), _test_tuples)
save_to_file("lemma_map", lemma_map) def process_batches(sentences_out): for i in range(0, 19): data = get_pickled(saved_data_file_tokens_entities_tags.format(i)) data_map_of_sentences = map_docs_to_sentences(data) sentences_for_docs = get_list_sentences(data_map_of_sentences) sentences = flatten_list(sentences_for_docs) sentences_out.append(sentences) # page_object_map(pages_input_file, pages_output_file) # article_parent_object_map(article_parents_input_file, article_parents_output_file) # category_parent_object_map(category_parents_input_file, category_parents_output_file) # child_article_object_map(child_articles_input_file, child_articles_output_file) # child_category_object_map(child_categories_input_file, child_categories_output_file) # link_by_source_object_map(link_by_source_input_file, link_by_source_output_file) # saved_data_file = "1mln_tokens" # save_to_file(saved_data_file, data) process_batches_for_lemma() sentences_pred = [] process_batches(sentences_pred) sentences = flatten_list(sentences_pred) save_to_file("all-sentences", sentences)
        _scrapped_not_found_errors.append((__t[0], _l))
        print(_i)
        _i += 1
    return _scrapped_not_found_errors


# scrap_mapped, scrap_not_mapped = get_pickled("scrap_found_and_not_found")

i = 1
not_found_errors_list_of_tuples_chunk1 = get_pickled("not_mapped_found_candidates-{}".format(i))
(_error_not_found_candidates, _found_candidates) = get_pickled("candidates")
scrap_found_map = get_pickled("scrap_found_map")

scrapped_not_found_errors = scrap_not_found2(_found_candidates)
save_to_file("scrapped_not_found_errors", scrapped_not_found_errors)

to_scrap = []
for _t in not_found_errors_list_of_tuples_chunk1:
    to_scrap.append((_t[0][0], _t[0][1], list(_t[1].keys())))

scrapped_not_found = scrap_not_found(to_scrap)
save_to_file("scrapped_not_found-{}".format(i), scrapped_not_found)

# _found, _not_found, _errors = get_pickled("chunks/bin/results_chunks-83")
# lemma_map = get_pickled("lemma_map_ext")
# (category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")
# stopwords = get_polish_stopwords()

merged_map = get_pickled("merged_map_not_found_errors")
def list_to_map(_list):
    # Turn a list of (key, value) tuples into a dict, keeping the first value
    # seen for each key.
    _map = {}
    for _tuple in _list:
        if _tuple[0] not in _map:
            _map[_tuple[0]] = _tuple[1]
    return _map


stopwords = get_polish_stopwords()
lemma_map = get_pickled("lemma_map_ext")

scrapped_not_found_errors = get_pickled("scrapped_not_found_errors")
_map_0 = list_to_map(scrapped_not_found_errors)
_map = remove_disamb_pages(_map_0, lemma_map, stopwords)
save_to_file("scrapped_not_found_errors_clean", _map)

# Merge the scraped context maps (batches 2-36, skipping batch 6) into one map.
tuple_map0 = remove_disamb_pages(get_pickled("tuple_map_scrap-{}".format(2)))
for i in range(3, 37):
    if i != 6:
        tuple_map1 = remove_disamb_pages(
            get_pickled("tuple_map_scrap-{}".format(i)))
        merge_tuple_map(tuple_map0, tuple_map1)
save_to_file("wikidata_context_map", tuple_map0)

# lemma_map = get_pickled("lemma_map-{}".format(0))

merged_map = {}
not_found_list_of_tuples = []

# global_map1 = get_pickled("global_map.10")
# global_map2 = get_pickled("global_map.13")
(mapped, not_mapped) = get_pickled("map_test_set_with_scrap_filled")

# Dump the final mapping as the PolEval Task 3 submission file: one row per
# mapped test tuple, with '=' as the field delimiter.
# NOTE: `dir` shadows the built-in and is expected to hold the output directory path.
# (A read-back sketch for this format follows after this block.)
import csv

with open(dir + 'roziewski-poleval-task3.tsv', 'w') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='=')
    for _t in mapped:
        writer.writerow([_t[0], _t[1]])

(mapped, not_mapped) = get_pickled("map_test_set_with_scrap")
# (mapped, not_mapped) = get_pickled("map_test_set")

clean_scrapped_not_found = get_pickled("clean_scrapped_not_found")
scrap_mapped, scrap_not_mapped = get_pickled("scrap_found_and_not_found")
scrap_found_map = map_scrap_found(scrap_mapped)
save_to_file("scrap_found_map", scrap_found_map)

# scrap_mapped, scrap_not_mapped = map_scrapped_not_found(clean_scrapped_not_found, w2vec)
# save_to_file("scrap_found_and_not_found", (scrap_mapped, scrap_not_mapped))

i = 1

# clean_scrapped_not_found = []
# for _t in scrapped_not_found:
#     _clean_tuple = (_t[0], _t[1], clean_tuples(_t[2]))
#     clean_scrapped_not_found.append(_clean_tuple)
#
# save_to_file("clean_scrapped_not_found", clean_scrapped_not_found)

# not_found_errors_list_of_tuples_chunk1 = get_pickled("not_mapped_found_candidates-5")
# _to_scrap = []
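# Read-back sketch for the submission file written above with csv.writer and
# delimiter='='. This is a minimal illustration only; the helper name
# read_submission is hypothetical and not part of this codebase.
def read_submission(path):
    import csv
    with open(path) as tsvfile:
        # Each row was written as [_t[0], _t[1]], so rows come back as pairs.
        return [tuple(row) for row in csv.reader(tsvfile, delimiter='=')]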
import sys

cnt = 0
multi = int(sys.argv[1])  # 1-based worker index: each run scrapes one slice of type_jsons
ply = 100000              # slice size

# (text_mapping, __disambiguation) = get_pickled("data-scraped-{}".format(multi))

cnt = ply * (multi - 1)
for _json in type_jsons[(multi - 1) * ply:multi * ply]:
    cnt += 1
    if _json['wiki']['pl']:
        # Normalise the Polish title into a Wikipedia-style page name.
        to_exclude = [' ']
        _entity = _json['wiki']['pl'].lower().translate(
            {ord(c): '_' for c in to_exclude})
        # get_soup/get_text fetch and extract the page text (sketched after this loop).
        text = get_text(get_soup(_entity))
        cnt += 1
        if cnt % (ply // 10) == 0:
            print("Progress {}".format(cnt))
        if _entity in text_mapping:
            # Page name already seen: move it into the disambiguation map.
            if _entity not in __disambiguation:
                __disambiguation[_entity] = [__disambiguation_helper[_entity]]
                del __disambiguation_helper[_entity]
            __disambiguation[_entity].append(text)
        else:
            text_mapping[_entity] = text
            __disambiguation_helper[_entity] = text
    if cnt % (ply // 10) == 0:
        # Checkpoint the scraped texts every ply // 10 processed entries.
        save_to_file("data-scraped-{}-{}".format(multi, cnt),
                     (text_mapping, __disambiguation))
        text_mapping = {}
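# get_soup() and get_text(), used by the scraping loop above, are not defined
# in this section. A minimal commented sketch of what they are assumed to do,
# based on requests and BeautifulSoup against Polish Wikipedia; the real
# helpers in this codebase may differ (URL, parser, error handling, cleanup).
#
# import requests
# from bs4 import BeautifulSoup
#
# def get_soup(_entity):
#     # Fetch the article for the normalised page name and parse the HTML.
#     url = "https://pl.wikipedia.org/wiki/{}".format(_entity)
#     return BeautifulSoup(requests.get(url).text, "html.parser")
#
# def get_text(soup):
#     # Join the paragraph texts of the article body into one string.
#     return " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))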