Example #1
0
def main(args):
    input_folder = os.path.join(args.build, 'preprocessed_documents')
    output_folder = os.path.join(args.build, 'raw_normalized_documents')
    config = None
    try:
        with open('config/config.json') as data:
            config = json.load(data)
            # Basic config validation
            if 'ngrams' not in config:
                raise Exception(
                    'Section "ngrams" missing from configuration file')
            else:
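                # JSON object keys are always strings; convert the n-gram sizes back to integers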
                for k in copy.deepcopy(config['ngrams']):
                    config['ngrams'][int(k)] = config['ngrams'][k]
                    del config['ngrams'][k]
    except Exception as e:
        print('Cannot load configuration file. Details: {}'.format(e))
        exit(5)

    if not args.u:
        try:
            if args.f:
                shutil.rmtree(output_folder)
        except Exception as e:
            print(e)

        try:
            os.mkdir(output_folder)
        except Exception as e:
            print(e)

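    # In update mode (args.u), existing normalized files are reused instead of being rebuilt.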
    update = args.u
    files = sorted([
        os.path.join(input_folder, f) for f in listdir(input_folder)
        if isfile(join(input_folder, f)) and '_text_without_conclusion.txt' in f
    ])
    raw_corpus = []
    corpus_id = []
    print('# Load documents')
    for i, p in enumerate(files):
        try:
            sys.stdout.write('\r - Load document {}/{}'.format(
                i + 1, len(files)))
            doc_id = p.split('/')[-1].split('_text_without_conclusion.txt')[0]
            raw_corpus.append(load_text_file(p))
            corpus_id.append(doc_id)
        except Exception as e:
            print(p, e)

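    # Normalization produces one token list per document (lemmatized via normalized_step).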
    normalized_tokens = []
    print('\n# Compute tokens')
    try:
        for i, doc in enumerate(raw_corpus):
            filename = os.path.join(output_folder,
                                    '{}_normalized.txt'.format(corpus_id[i]))
            sys.stdout.write('\r - Normalize document {}/{}'.format(
                i + 1, len(raw_corpus)))
            if not update or not os.path.isfile(filename):
                normalized_tokens.append(
                    normalized_step(doc, force=args.f, lemmatization=True))
            else:
                with open(filename, 'r') as f:
                    normalized_tokens.append(f.read().split())
    except Exception as e:
        print('\t -> Could not normalize the tokens. Details: {}'.format(e))
        exit(40)

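    # Build the configured n-grams for every document and merge them into a single token list.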
    print('\n# Generate ngrams from tokens')
    all_grams = []
    doc_grammed = []
    try:
        for i, doc in enumerate(normalized_tokens):
            filename = os.path.join(output_folder,
                                    '{}_normalized.txt'.format(corpus_id[i]))
            sys.stdout.write('\r - Calculate ngrams for document {}/{}'.format(
                i + 1, len(raw_corpus)))
            if not update or not os.path.isfile(filename):
                grams = ngram_step(doc, config['ngrams'], force=args.f)
                merged = []
                for g in grams.values():
                    merged.extend(g)
                doc_grammed.append(merged)
                all_grams.extend(merged)
            else:
                print('\t -> Load document as already normalized.')
                with open(filename, 'r') as f:
                    all_grams.extend(f.read().split())
                    doc_grammed.append(None)
    except Exception as e:
        print(e)
    print('')

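    # Count every n-gram across the corpus and save the frequencies as the full dictionary.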
    f = Counter(all_grams)
    print('# Save the full dictionary')
    with open(os.path.join(output_folder, 'full_dictionary.txt'),
              'w') as outfile:
        json.dump(f, outfile, indent=4, sort_keys=True)

    print('# Save normalized documents')
    for i, doc in enumerate(doc_grammed):
        if doc is not None:
            sys.stdout.write('\r - Save document {}/{}: {}'.format(
                i + 1, len(doc_grammed), corpus_id[i]))
            with open(
                    os.path.join(output_folder,
                                 '{}_normalized.txt'.format(corpus_id[i])),
                    'a') as file:
                file.write(' '.join(doc))
    print('')
Example #2
0
def main(args):
    input_file = os.path.join(
        args.build,
        'cases_info/raw_cases_info.json' if args.processed_folder == 'all' else
        'cases_info/raw_cases_info_{}.json'.format(args.processed_folder))
    input_folder = os.path.join(args.build, 'raw_normalized_documents')
    output_folder = os.path.join(args.build, 'processed_documents',
                                 args.processed_folder)
    print('# Read configuration')
    config = None
    try:
        with open(CONFIG_FILE) as data:
            config = json.load(data)
            # Basic config validation
            if 'ngrams' not in config:
                raise Exception(
                    'Section "ngrams" missing from configuration file')
            else:
                for k in copy.deepcopy(config['ngrams']):
                    config['ngrams'][int(k)] = config['ngrams'][k]
                    del config['ngrams'][k]
    except Exception as e:
        print('Cannot load configuration file. Details: {}'.format(e))
        exit(5)

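    # Index cases by itemid so only documents belonging to a known case are processed.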
    cases_index = {}
    with open(input_file, 'r') as f:
        content = f.read()
        cases = json.loads(content)
        cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    if not args.u:
        try:
            if args.f:
                shutil.rmtree(output_folder)
        except Exception as e:
            print(e)

        try:
            os.makedirs(output_folder)
        except Exception as e:
            print(e)

    update = args.u
    files = [
        os.path.join(input_folder, f) for f in listdir(input_folder)
        if isfile(join(input_folder, f)) and '_normalized.txt' in f
        and f.split('/')[-1].split('_normalized.txt')[0] in cases_index
    ]
    raw_corpus = []
    corpus_id = []
    print('# Load documents')
    for i, p in enumerate(files):
        try:
            sys.stdout.write('\r - Load document {}/{}'.format(
                i + 1, len(files)))
            doc_id = p.split('/')[-1].split('_normalized.txt')[0]
            raw_corpus.append(load_text_file(p).split())
            corpus_id.append(doc_id)
        except Exception as e:
            print(p, e)
    print('')
    #data = json.load(open('./full_dictionary.txt'))
    f = [t for doc in raw_corpus for t in doc]
    f = Counter(f)
    # Keep only the most frequent tokens for the dictionary
    f = f.most_common(args.limit_tokens)
    words = [w[0] for w in f]
    #print(words)
    #print(len(doc_grammed[0]), len(doc_grammed[1]))
    #print(len(all_grams), len(f))

    #dictionary = corpora.Dictionary([all_grams])
    print('# Create dictionary')
    dictionary = corpora.Dictionary([words])
    dictionary.save(os.path.join(output_folder, 'dictionary.dict'))
    with open(os.path.join(output_folder, 'feature_to_id.dict'),
              'w') as outfile:
        json.dump(dictionary.token2id, outfile, indent=4, sort_keys=True)
    #print(dictionary.token2id)
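    # doc2bow converts each token list into a sparse (token_id, count) vector over the dictionary.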
    corpus = [dictionary.doc2bow(text) for text in raw_corpus]
    print('# Create Bag of Words')
    for i, doc in enumerate(corpus):
        filename = os.path.join(output_folder,
                                '{}_bow.txt'.format(corpus_id[i]))
        #if update and not os.path.isfile(filename):
        with open(filename, 'w') as file:
            for f, v in doc:
                file.write('{}:{} '.format(f, v))

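    # Re-weight the bag-of-words counts with TF-IDF and write one sparse vector per document.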
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    print('# Create TFIDF')
    for i, doc in enumerate(corpus_tfidf):
        with open(
                os.path.join(output_folder,
                             '{}_tfidf.txt'.format(corpus_id[i])),
                'w') as file:
            for f, v in doc:
                file.write('{}:{} '.format(f, v))
Example #3
0
def run(console,
        build,
        title,
        limit_tokens,
        doc_ids=None,
        processed_folder='all',
        force=False,
        update=False):
    __console = console
    global print
    print = __console.print

    input_file = os.path.join(
        build, 'raw', 'cases_info',
        'raw_cases_info_{}.json'.format(processed_folder))
    input_folder = os.path.join(build, 'raw', 'normalized_documents')
    output_folder = os.path.join(build, 'structured')
    output_folder_tfidf = os.path.join(output_folder, 'tfidf')
    output_folder_bow = os.path.join(output_folder, 'bow')

    print(Markdown("- **Step configuration**"))
    print(TAB + '> Step folder: {}'.format(output_folder_tfidf))
    make_build_folder(console, output_folder_tfidf, force, strict=False)
    print(TAB + '> Step folder: {}'.format(output_folder_bow))
    make_build_folder(console, output_folder_bow, force, strict=False)

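    # Validation only: fail fast if the n-grams section is missing from the configuration.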
    try:
        config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)
    print(TAB + '> Read configuration [green][DONE]')

    cases_index = {}
    with open(input_file, 'r') as f:
        content = f.read()
        cases = json.loads(content)
        cases_index = {c['itemid']: i for i, c in enumerate(cases)}

    files = get_files(doc_ids, input_folder, cases_index)

    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Create dictionary**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task(
            "Loading...",
            total=len(files),
            error="",
            doc=files[0].split('/')[-1].split('_normalized.txt')[0])
        for i, p in enumerate(files):
            error = ""
            try:
                doc_id = p.split('/')[-1].split('_normalized.txt')[0]
                raw_corpus.append(load_text_file(p).split())
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    # data = json.load(open('./full_dictionary.txt'))
    f = [t for doc in raw_corpus for t in doc]
    f = Counter(f)
    # Keep only the most frequent tokens for the dictionary
    f = f.most_common(int(limit_tokens))
    words = [w[0] for w in f]

    # dictionary = corpora.Dictionary([all_grams])
    print(TAB + '> Create dictionary')
    dictionary = corpora.Dictionary([words])
    dictionary.save(os.path.join(output_folder, 'dictionary.dict'))
    with open(os.path.join(output_folder, 'feature_to_id.dict'),
              'w') as outfile:
        json.dump(dictionary.token2id, outfile, indent=4, sort_keys=True)
    corpus = [dictionary.doc2bow(text) for text in raw_corpus]
    print(Markdown('- **Create language models**'))
    with Progress(
 Create Bag of Word">
            TAB + "> Create Bag of Words... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...",
                                 total=len(corpus),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(corpus):
            error = ""
            filename = os.path.join(output_folder_bow,
                                    '{}_bow.txt'.format(corpus_id[i]))
            # if update and not os.path.isfile(filename):
            with open(filename, 'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
 Create Bag of Word">
    print(TAB + "> Create Bag of Words... [green][DONE]")

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    with Progress(
            TAB + "> Create TF-IDF... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console) as progress:
        task = progress.add_task("Loading...",
                                 total=len(corpus_tfidf),
                                 error="",
                                 doc=corpus_id[0])
        for i, doc in enumerate(corpus_tfidf):
            error = ""
            with open(
                    os.path.join(output_folder_tfidf,
                                 '{}_tfidf.txt'.format(corpus_id[i])),
                    'w') as file:
                for f, v in doc:
                    file.write('{}:{} '.format(f, v))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + "> Create TF-IDF... [green][DONE]")
Example #4
0
def run(console, build, title, doc_ids=None, force=False, update=False):
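    # Redirect print() to the rich console so regular output and Progress bars render consistently.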
    __console = console
    global print
    print = __console.print

    print(Markdown("- **Step configuration**"))
    input_folder = os.path.join(build, 'raw', 'preprocessed_documents')
    output_folder = os.path.join(build, 'raw', 'normalized_documents')
    ngrams_config = {}
    try:
        ngrams_config = config()['steps']['normalize']['ngrams']
    except Exception as e:
        print('Cannot retrieve n-grams configuration. Details: {}'.format(e))
        exit(5)

    print(TAB + '> Step folder: {}'.format(output_folder))
    make_build_folder(console, output_folder, force, strict=False)

    files = get_files(doc_ids, input_folder)

    raw_corpus = []
    corpus_id = []
    print(Markdown('- **Load documents**'))
    with Progress(
            TAB + "> Loading in memory... [IN PROGRESS]",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task("Loading...", total=len(files), error="",
                                 doc=files[0].split('/')[-1].split('_text_without_conclusion.txt')[0])
        for i, p in enumerate(files):
            error = ""
            doc_id = p.split('/')[-1].split('_text_without_conclusion.txt')[0]
            try:
                raw_corpus.append(load_text_file(p))
                corpus_id.append(doc_id)
            except Exception as e:
                error = '\n| {}'.format('Could not load the document')
                log.debug(p, e)
            progress.update(task, advance=1, error=error, doc=doc_id)
    print(TAB + "> Loading in memory... [green][DONE]")

    normalized_tokens = []
    print(Markdown('- **Generate language model**'))
    try:
        with Progress(
                TAB + "> Normalize... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...", total=len(raw_corpus), error="", doc=corpus_id[0])
            for i, doc in enumerate(raw_corpus):
                error = ""
                filename = os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    normalized_tokens.append(normalized_step(doc, force=force, lemmatization=True))
                else:
                    with open(filename, 'r') as f:
                        normalized_tokens.append(f.read().split())
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception as e:
        print(TAB + '[bold red]:double_exclamation_mark: Could not normalize the tokens. Details: {}'.format(e))
        exit(40)
    print(TAB + "> Normalize... [green][DONE]")

    all_grams = []
    doc_grammed = []
    try:
        with Progress(
                TAB + "> Compute ngrams... [IN PROGRESS]\n",
                BarColumn(30),
                TimeRemainingColumn(),
                "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
                "{task.fields[error]}",
                transient=True,
                console=console
        ) as progress:
            task = progress.add_task("Compute tokens...", total=len(corpus_id), error="", doc=corpus_id[0])
            for i, doc in enumerate(normalized_tokens):
                error = ""
                filename = os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i]))
                if not update or not os.path.isfile(filename):
                    grams = ngram_step(doc, ngrams_config, force=force)
                    merged = []
                    for g in grams.values():
                        merged.extend(g)
                    doc_grammed.append(merged)
                    all_grams.extend(merged)
                else:
                    error = "\n| Load document as already normalized."
                    with open(filename, 'r') as f:
                        all_grams.extend(f.read().split())
                        doc_grammed.append(None)
                progress.update(task, advance=1, error=error, doc=corpus_id[i])
    except Exception:
        console.print_exception()
    print(TAB + "> Compute ngrams... [green][DONE]")

    f = Counter(all_grams)
    with open(os.path.join(output_folder, 'full_dictionary.txt'), 'w') as outfile:
        json.dump(f, outfile, indent=4, sort_keys=True)
    print(TAB + '> Save the full dictionary [green][DONE]')

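    # Persist newly computed documents; entries loaded from disk (None placeholders) are skipped.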
    with Progress(
            TAB + "> Save normalized documents... [IN PROGRESS]\n",
            BarColumn(30),
            TimeRemainingColumn(),
            "| Document [blue]{task.fields[doc]} [white]({task.completed}/{task.total})"
            "{task.fields[error]}",
            transient=True,
            console=console
    ) as progress:
        task = progress.add_task("Compute tokens...", total=len(doc_grammed), error="", doc=corpus_id[0])
        for i, doc in enumerate(doc_grammed):
            error = ""
            if doc is not None:
                with open(os.path.join(output_folder, '{}_normalized.txt'.format(corpus_id[i])), 'a') as file:
                    file.write(' '.join(doc))
            progress.update(task, advance=1, error=error, doc=corpus_id[i])
    print(TAB + '> Save normalized documents... [green][DONE]')