def main():
    """Build a Whoosh index from PO files and write the summary outputs.

    Reads the PO directory, debug keyword and project names through
    ``read_parameters()``, configures an ``IndexCreator`` with them, runs the
    indexing, then hands the resulting project/word figures and the project
    options to ``_write_statistics`` and ``_write_select_projects``. The
    elapsed wall-clock time is printed at the end.
    """
    print("Create Whoosh index from a PO file")
    print("Use --help for assistance")

    started_at = time.time()

    # Best effort: a failure to apply the user's locale is reported on
    # stdout but does not stop the indexing.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except Exception as detail:
        print("Exception: " + str(detail))

    parameters = read_parameters()
    po_directory, debug_keyword, projects_names = parameters

    creator = IndexCreator(po_directory)
    creator.debug_keyword = debug_keyword
    creator.projects_names = projects_names

    creator.create()
    creator.process_projects()

    _write_statistics(creator.projects, creator.words)
    _write_select_projects(creator.options)

    elapsed = time.time() - started_at
    print("time used to create the index: " + str(elapsed))
def main():
    """Build and save a Whoosh index from the JSON files in ``data/jsons/``.

    Creates the index, processes the files, saves the index and finally
    prints how long the whole run took.
    """
    print("Create Whoosh index from a directory with JSONs")

    began = datetime.datetime.now()

    creator = IndexCreator("data/jsons/")
    creator.create()
    creator.process_files()
    creator.save_index()

    elapsed = datetime.datetime.now() - began
    message = "Time used to create the index: {0} ".format(elapsed)
    print(message)
def main():
    """Create a Whoosh index for a PO file.

    Given a PO file, enumerates all the strings, and creates a Whoosh index
    to be able to search later. After indexing, two HTML pages are rendered
    through ``process_template``: ``statistics.html`` (date, number of
    projects, number of words) and ``select-projects.html`` (the sorted list
    of project options). The elapsed wall-clock time is printed at the end.
    """
    print("Create Whoosh index from a PO file")
    print("Use --help for assistance")

    start_time = time.time()

    # Best effort: a failure to apply the user's locale is reported on
    # stdout but does not stop the indexing.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except Exception as detail:
        print("Exception: " + str(detail))

    po_directory, debug_keyword, projects_names = read_parameters()
    indexCreator = IndexCreator(po_directory, debug_keyword, projects_names)
    indexCreator.create()
    indexCreator.process_projects()

    ctx = {
        'date': datetime.date.today().strftime("%d/%m/%Y"),
        'projects': str(indexCreator.projects),
        # locale.format() was deprecated in Python 3.7 and removed in 3.12;
        # locale.format_string() produces the same grouped output for a
        # single "%d" specifier.
        'words': locale.format_string("%d", indexCreator.words,
                                      grouping=True),
    }
    process_template("templates/statistics.mustache", "statistics.html", ctx)

    ctx = {
        # This is the list of projects to display for the user to select.
        'options': sorted(indexCreator.options, key=lambda x: x.lower()),
    }
    process_template("templates/select-projects.mustache",
                     "select-projects.html", ctx)

    end_time = time.time() - start_time
    print("time used to create the index: " + str(end_time))
def _process_xml():
    """Import Catalan verbs, adverbs and adjectives from a Wiktionary dump.

    Parses ``cawiktionary-20160701-pages-meta-current.xml`` and, for every
    page whose revision text is tagged as a Catalan verb, adverb or
    adjective, extracts the en/es/fr/de/it translations and a description.

    A page is discarded when:
      * it is none of verb / adverb / adjective,
      * its title is not in the list from ``read_english_word_list()``,
      * ``term_exists_in_index`` reports it as already indexed,
      * it is a verb whose title does not end in ``r``,
      * it has no translation in any of the five languages.

    Surviving entries are appended to ``words-ca.txt`` and
    ``descriptions-ca.txt`` and written to the index. Per-language counters
    are then passed to ``_show_statistics`` and ``_save_statistics`` and the
    index is saved.
    """
    WIKIDICTIONARY = 2  # passed as ``source`` for every entry written here

    en_labels = 0
    ca_labels = 0
    fr_labels = 0
    de_labels = 0
    es_labels = 0
    it_labels = 0
    ca_descs = 0
    it_descs = 0

    words = read_english_word_list()
    index = IndexCreator()
    index.open()
    # Collected while scanning revisions; not consumed inside this function.
    authors = set()

    # ``with`` guarantees both output files are closed even when parsing or
    # indexing raises part-way through.
    with open('words-ca.txt', 'w') as words_file_ca, \
            open('descriptions-ca.txt', 'w') as descriptions_file_ca:
        e = xml.etree.ElementTree.parse(
            'cawiktionary-20160701-pages-meta-current.xml').getroot()

        # Elements are iterated directly: Element.getchildren() is
        # deprecated and was removed in Python 3.9.
        for page in e:
            verb = False
            adverbi = False
            adjectiu = False
            en_label = u''
            ca_label = u''
            fr_label = u''
            de_label = u''
            es_label = u''
            it_label = u''

            for page_element in page:
                if 'title' in page_element.tag:
                    ca_label = unicode(page_element.text)

                if 'revision' in page_element.tag:
                    text = _get_revision_text(page_element)

                    username = _get_username(page_element)
                    if username:
                        authors.add(username)

                    if text is not None:
                        if '{{ca-verb' in text:
                            verb = True
                        elif '{{lema|ca|adv}}' in text:
                            adverbi = True
                        elif '{{ca-adj' in text:
                            adjectiu = True

                        if verb or adverbi or adjectiu:
                            en_label = _get_translation(text, '{{trad|en|')
                            es_label = _get_translation(text, '{{trad|es|')
                            fr_label = _get_translation(text, '{{trad|fr|')
                            de_label = _get_translation(text, '{{trad|de|')
                            it_label = _get_translation(text, '{{trad|it|')

            if not (verb or adverbi or adjectiu):
                continue

            if ca_label.lower().strip() not in words:
                logging.debug("Discard not in word list: " + ca_label)
                continue

            if term_exists_in_index(index, ca_label, en_label):
                logging.debug("Discard already existing word in index: " + ca_label)
                continue

            # TODO: A better way to determine infinitives
            ca_label_str = to_str(ca_label)
            # Slicing (rather than indexing) keeps an empty title from
            # raising IndexError; it is simply discarded.
            if verb and ca_label_str[-1:] != 'r':
                logging.debug("Discard verb is not infinitive: " + ca_label)
                continue

            ca_desc = u''
            textExtract = TextExtract(text)
            s = textExtract.GetDescription()

            if not (en_label or es_label or fr_label or de_label or it_label):
                logging.debug("Discard only ca_label:" + ca_label)
                continue

            if len(s) > 0:
                ca_desc = s
                ca_descs += 1

            ca_labels += 1
            if en_label:
                en_labels += 1
            if es_label:
                es_labels += 1
            if fr_label:
                fr_labels += 1
            if de_label:
                de_labels += 1
            if it_label:
                it_labels += 1

            words_file_ca.write(ca_label.encode('utf-8') + '\r\n')

            # ca_desc is never None (it starts as u''), so a line is written
            # for every kept entry, even when the description is empty.
            line = '{0} - {1}\r\n'.format(ca_label.encode('utf-8'),
                                          ca_desc.encode('utf-8'))
            descriptions_file_ca.write(line)

            index.write_entry(word_en=en_label,
                              word_ca=ca_label,
                              word_fr=fr_label,
                              word_de=de_label,
                              word_es=es_label,
                              word_it=it_label,
                              definition_en=None,
                              definition_ca=ca_desc,
                              definition_fr=None,
                              definition_de=None,
                              definition_es=None,
                              definition_it=None,
                              image=None,
                              permission=None,
                              gec=None,
                              wikidata_id=None,
                              wikiquote_ca=None,
                              wikidictionary_ca=ca_label,
                              source=WIKIDICTIONARY)

        stats = {
            "ca_labels": ca_labels,
            "ca_descs": ca_descs,
            "en_labels": en_labels,
            "fr_labels": fr_labels,
            "de_labels": de_labels,
            "es_labels": es_labels,
            "it_labels": it_labels,
        }

        _show_statistics(stats)
        _save_statistics(stats)
        index.save()