Exemplo n.º 1
0
def find_organisations_reasons(folder: str):
    """ Go through the ``*.txt`` files in the given folder, extract organisation
        names and their reason for appearance in each file.

        Results are resumed from the cache returned by ``check_cache()`` and
        written back with ``dump_to_cache()`` after every file, so an interrupted
        run can pick up where it left off.

        Args:
            folder: Directory whose ``*.txt`` files (non-recursive) are processed.

        Returns:
            The tuple ``(org_reasons, org_counts)`` as maintained by
            ``add_to_organisation()``. If the run is interrupted (CTRL+C) or an
            error occurs, whatever was accumulated up to that point is returned
            instead of raising. """
    org_reasons, org_counts = {}, {}
    # Defined before the `try` so the handlers below can always report
    # progress, even when loading the models or the cache fails.
    files_processed = []
    try:
        # Get flair models.
        ner_tagger, frame_tagger, pos_tagger = get_flair_taggers()
        # Fetch results from cache, if present.
        files_processed, org_reasons, org_counts = check_cache()
        # Find files to process from path.
        files = glob.glob(f"{folder}/*.txt")
        print(f"Processing {len(files)} files in '{folder}'.")
        # Remove previously processed file names.
        already_processed = set(files_processed)
        to_process = [f for f in files if f not in already_processed]
        # 1-based position of the file currently being worked on; only used
        # for the progress display.
        file_count = len(files_processed) + 1
        for path in to_process:
            print(f"[{file_count}/{len(files)}] Processing {path}...")
            # Read the whole file up front so the handle is closed right away.
            with open(path, "r") as file:
                text = file.read()
            # Go through paragraphs sentence by sentence and extract information.
            for sentences_tokenized in process(text):
                for tokens in sentences_tokenized:
                    sentence_text = "".join(
                        f"{token.spacing}{token.value}" for token in tokens)
                    sentence = Sentence(sentence_text.strip())
                    # Add NER, POS and Semantic Frame Detection tags to sentence.
                    ner_tagger.predict(sentence)
                    frame_tagger.predict(sentence)
                    pos_tagger.predict(sentence)
                    # Extract all organisations.
                    organisations = get_organisations(sentence)
                    if not organisations:
                        continue

                    # Find the first organisation occurrence and its reason
                    # for appearance.
                    first, *remaining = organisations
                    name = clean_organization(first.text)
                    reason = get_reason_for_appearance(first, sentence)
                    add_to_organisation(name, reason, org_counts, org_reasons)

                    # Count remaining organisations, but don't find their reason
                    # for appearance, since the organisations following the
                    # first one don't have meaningful reasons, leading to
                    # broken sentences.
                    for other in remaining:
                        name = clean_organization(other.text)
                        add_to_organisation(name, None, org_counts,
                                            org_reasons)

            files_processed.append(path)
            # Store in cache after processing.
            dump_to_cache(files_processed, org_reasons, org_counts)
            file_count += 1

        # Drop the pronouns "I" and "We" from the results. `pop` with a
        # default is a no-op when the key is absent, so this cannot raise.
        for pronoun in ("I", "We"):
            org_reasons.pop(pronoun, None)
            org_counts.pop(pronoun, None)

        print(f"\nFinished processing {len(files_processed)} files.")
        return org_reasons, org_counts
    except KeyboardInterrupt:
        # Handle early exit by user (CTRL+C). KeyboardInterrupt is not a
        # subclass of Exception, so it needs its own clause.
        print("\n\nExiting...")
    except Exception as e:
        # Best effort: report the problem and fall through to return the
        # partial results gathered so far.
        print(e)
        print("\n\nExiting...")
    print(f"Finished processing {len(files_processed)} files.")
    return org_reasons, org_counts