def main():
    """Build a Whoosh search index from the strings of a PO file.

    Reads the run parameters, configures an ``IndexCreator`` with them, lets
    it create the index and process the projects, then hands the resulting
    project/word figures to ``_write_statistics`` and the options to
    ``_write_select_projects``.  The elapsed wall-clock time is printed last.
    """
    print("Create Whoosh index from a PO file")
    print("Use --help for assistance")

    started = time.time()

    # Switch to the user's default locale; a failure is reported, not fatal.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except Exception as error:
        print("Exception: " + str(error))

    parameters = read_parameters()
    po_directory, debug_keyword, projects_names = parameters

    creator = IndexCreator(po_directory)
    creator.debug_keyword = debug_keyword
    creator.projects_names = projects_names
    creator.create()
    creator.process_projects()

    _write_statistics(creator.projects, creator.words)
    _write_select_projects(creator.options)

    elapsed = time.time() - started
    print("time used to create the index: " + str(elapsed))
# Example no. 2
# 0
def main():
    """Build a Whoosh index from the contents of ``data/jsons/``.

    Creates the index through ``IndexCreator``, processes the files, saves
    the index and finally prints how long the whole run took.
    """
    print("Create Whoosh index from a directory with JSONs")

    started_at = datetime.datetime.now()

    creator = IndexCreator("data/jsons/")
    creator.create()
    creator.process_files()
    creator.save_index()

    # Elapsed time is a timedelta; it is rendered with its default str() form.
    elapsed = datetime.datetime.now() - started_at
    print("Time used to create the index: {0} ".format(elapsed))
# Example no. 3
# 0
def main():
    """Create a Whoosh index for a PO file.

    Given a PO file, enumerates all the strings, and creates a Whoosh index to
    be able to search later.  After indexing, two contexts are passed to
    ``process_template``: one with the date, project count and word count for
    ``statistics.html`` and one with the sorted project options for
    ``select-projects.html``.  The elapsed wall-clock time is printed last.
    """
    print("Create Whoosh index from a PO file")
    print("Use --help for assistance")

    start_time = time.time()

    # Best effort: use the user's default locale so the word count below is
    # grouped accordingly; a failure is reported but does not abort the run.
    try:
        locale.setlocale(locale.LC_ALL, '')
    except Exception as detail:
        print("Exception: " + str(detail))

    po_directory, debug_keyword, projects_names = read_parameters()
    index_creator = IndexCreator(po_directory, debug_keyword, projects_names)
    index_creator.create()
    index_creator.process_projects()

    ctx = {
        'date': datetime.date.today().strftime("%d/%m/%Y"),
        'projects': str(index_creator.projects),
        # locale.format() was deprecated in Python 3.7 and removed in 3.12;
        # format_string() produces the same result and also exists in Python 2.
        'words': locale.format_string("%d", index_creator.words,
                                      grouping=True),
    }
    process_template("templates/statistics.mustache", "statistics.html", ctx)

    ctx = {
        # This is the list of projects to display for the user to select.
        'options': sorted(index_creator.options, key=lambda x: x.lower()),
    }
    process_template("templates/select-projects.mustache",
                     "select-projects.html", ctx)

    elapsed = time.time() - start_time
    print("time used to create the index: " + str(elapsed))
def _process_xml():
    """Import verbs, adverbs and adjectives from the cawiktionary XML dump.

    Parses ``cawiktionary-20160701-pages-meta-current.xml`` and inspects every
    page.  A page is kept only when its revision text marks it as a verb
    (``{{ca-verb``), an adverb (``{{lema|ca|adv}}``) or an adjective
    (``{{ca-adj``).  Such a page is then discarded when:

    * its title is not in the list returned by ``read_english_word_list()``,
    * ``term_exists_in_index`` reports it as already indexed,
    * it is a verb whose title does not end in ``r``, or
    * none of the en/es/fr/de/it translations was found.

    Every accepted entry is written to the index, to ``words-ca.txt`` and to
    ``descriptions-ca.txt``.  The per-language counters are finally passed to
    ``_show_statistics`` and ``_save_statistics`` and the index is saved.

    NOTE: this is Python 2 code (``unicode`` and byte-string file writes).
    """
    WIKIDICTIONARY = 2
    stats = {
        "ca_labels": 0,
        "ca_descs": 0,
        "en_labels": 0,
        "fr_labels": 0,
        "de_labels": 0,
        "es_labels": 0,
        "it_labels": 0,
    }
    words = read_english_word_list()

    index = IndexCreator()
    index.open()
    # NOTE(review): usernames are collected but never read in this function.
    authors = set()

    # The context manager guarantees both files are closed even when parsing
    # or indexing raises.
    with open('words-ca.txt', 'w') as words_file_ca, \
            open('descriptions-ca.txt', 'w') as descriptions_file_ca:
        tree = xml.etree.ElementTree.parse(
            'cawiktionary-20160701-pages-meta-current.xml')

        # Iterate the element directly: Element.getchildren() was deprecated
        # in Python 2.7 and removed in Python 3.9.
        for page in tree.getroot():
            ca_label, text, verb, wanted, labels = \
                _scan_wiktionary_page(page, authors)

            if not wanted:
                continue

            if ca_label.lower().strip() not in words:
                logging.debug("Discard not in word list: " + ca_label)
                continue

            if term_exists_in_index(index, ca_label, labels['en']):
                logging.debug("Discard already existing word in index: " + ca_label)
                continue

            # TODO: A better way to determine infinitives
            ca_label_str = to_str(ca_label)
            # endswith() is also safe for an empty title, unlike indexing the
            # last character.
            if verb and not ca_label_str.endswith('r'):
                logging.debug("Discard verb is not infinitive: " + ca_label)
                continue

            description = TextExtract(text).GetDescription()

            if not any(labels.values()):
                logging.debug("Discard only ca_label:" + ca_label)
                continue

            ca_desc = u''
            if description:
                ca_desc = description
                stats["ca_descs"] += 1

            stats["ca_labels"] += 1
            for lang, label in labels.items():
                if label:
                    stats[lang + "_labels"] += 1

            words_file_ca.write(ca_label.encode('utf-8') + '\r\n')

            # NOTE(review): a line is written even when ca_desc is empty;
            # confirm whether empty descriptions should be skipped instead.
            line = '{0} - {1}\r\n'.format(ca_label.encode('utf-8'),
                                          ca_desc.encode('utf-8'))
            descriptions_file_ca.write(line)

            index.write_entry(word_en=labels['en'],
                              word_ca=ca_label,
                              word_fr=labels['fr'],
                              word_de=labels['de'],
                              word_es=labels['es'],
                              word_it=labels['it'],
                              definition_en=None,
                              definition_ca=ca_desc,
                              definition_fr=None,
                              definition_de=None,
                              definition_es=None,
                              definition_it=None,
                              image=None,
                              permission=None,
                              gec=None,
                              wikidata_id=None,
                              wikiquote_ca=None,
                              wikidictionary_ca=ca_label,
                              source=WIKIDICTIONARY)

        _show_statistics(stats)
        _save_statistics(stats)
        index.save()


def _scan_wiktionary_page(page, authors):
    """Collect the title, revision text, word class and translations of a page.

    Non-empty revision usernames found on the page are added to ``authors``.

    Returns a tuple ``(ca_label, text, verb, wanted, labels)`` where
    ``ca_label`` is the page title, ``text`` the text of the last revision
    element seen (``None`` if there was none), ``verb`` tells whether the
    verb marker was found, ``wanted`` whether any of the verb/adverb/adjective
    markers was found, and ``labels`` maps each of en/es/fr/de/it to the
    translation returned by ``_get_translation`` (``u''`` when not looked up).
    """
    langs = ('en', 'es', 'fr', 'de', 'it')
    labels = dict.fromkeys(langs, u'')
    ca_label = u''
    text = None
    verb = adverbi = adjectiu = False

    for page_element in page:
        if 'title' in page_element.tag:
            ca_label = unicode(page_element.text)

        if 'revision' in page_element.tag:
            text = _get_revision_text(page_element)
            username = _get_username(page_element)
            if username:
                authors.add(username)

            if text is None:
                continue

            if '{{ca-verb' in text:
                verb = True
            elif '{{lema|ca|adv}}' in text:
                adverbi = True
            elif '{{ca-adj' in text:
                adjectiu = True

            # Flags persist across revisions of the same page, so a later
            # revision refreshes the translations once any marker was seen.
            if verb or adverbi or adjectiu:
                for lang in langs:
                    labels[lang] = _get_translation(text,
                                                    '{{trad|' + lang + '|')

    return ca_label, text, verb, (verb or adverbi or adjectiu), labels