def chunkScript(movie, max_len=300):
    """
    add `[SEP]` token at the end of each sentence in script
    """
    outscript = ['']
    # movie = re.sub('\s+',' ', movie)
    prev_len = 0
    try:
        segmentation = list(segmenter.process(movie))

        for paragraph in segmentation:
            for s in paragraph:
                sent = ' '.join([t.value for t in s]) + ' [SEP]'
                if (prev_len + len(s) < max_len - 2):
                    outscript[-1] += sent
                    prev_len += len(s)
                elif (len(s) < max_len - 2):
                    outscript.append(sent)
                    prev_len = len(s)
                else:
                    # new sentence is longer than max_len
                    # print(len(outscript))
                    outscript += [
                        ' '.join(__extractValueFromToken__(s2))
                        for s2 in chunkset(s, max_len - 1)
                    ]
                    # print(len(outscript))
                    prev_len = len(outscript[-1])
    except Exception:
        print("segmentation error for movie:", movie)
        return []

    return outscript
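
This snippet, like the others on this page, assumes syntok's segmenter is imported at module level; `chunkset` and `__extractValueFromToken__` are project helpers not shown here. A minimal, hypothetical driver under those assumptions:

import syntok.segmenter as segmenter  # assumed module-level import

# Hypothetical call; real movie scripts are far longer, and any sentence over
# max_len tokens would additionally need the chunkset helper referenced above.
chunks = chunkScript("A phone rings. Nobody answers it.", max_len=300)
print(chunks)  # both short sentences land in one chunk, each ending in ' [SEP]'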
Example #2
def main():
    parser = argparse.ArgumentParser(description='Preprocess corpus')
    parser.add_argument('--config_file', type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)

    orig_dir = config['general']['orig_dir']
    logging.info(f'Preprocessing corpus files under: {orig_dir}')
    corpus_file = config['general']['corpus_file']
    max_files = int(config['general']['max_files'])

    fns = glob.glob(f'{orig_dir}/*.txt')
    if max_files > -1:
        fns = list(fns)[:max_files]
    
    metadata = pd.read_excel(config['general']['metadata'])
    metadata.fillna('', inplace=True)
    metadata.set_index('filepath', inplace=True)
    
    with open(corpus_file, 'w') as corpusf:
        for fn in tqdm(fns):
            with open(fn, encoding='utf-8-sig') as f:
                # try to extract metadata:
                md = {'author': 'Unknown', 'title': 'Unknown'}
                try:
                    r = metadata.loc[os.path.basename(fn).replace('.txt', '')]
                    if r['author:lastname']:
                        md['author'] = r['author:lastname']
                    if r['title:detail']:
                        md['title'] = r['title:detail']
                except KeyError:
                    pass
                
                # extract sentences:
                sentences = []
                for paragraph in segmenter.process(f.read()): 
                    for sentence in paragraph:
                        # get original sentence:
                        string_repr = ''
                        for token in sentence:
                            string_repr += f'{token.spacing}{token.value}'
                        string_repr = WHITESPACE.sub(' ', string_repr).strip()

                        # get individual tokens:
                        tokens = [t.value.strip() for t in sentence]
                        tokens = [t for t in tokens if t]

                        if tokens and string_repr:
                            sentences.append({'sentence': string_repr,
                                              'tokens': tokens})
                if sentences:
                    sentences = {'metadata': md,
                                 'sentences': sentences}
                    corpusf.write(json.dumps(sentences) + '\n')

    logging.info(f'Finished tokenizing corpus to: {corpus_file}')
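
The `WHITESPACE` constant used above is defined outside this excerpt; given how it is applied, it is presumably a compiled whitespace pattern along these lines:

import re

WHITESPACE = re.compile(r'\s+')  # assumed definition; collapses whitespace runs to a single space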
Example #3
def tokenize(text, lemma_stop=False):
    # NOTE: lemma_stop is accepted for interface compatibility but unused here.
    sentenceC = []
    for parg in segmenter.process(text):
        for sentence in parg:
            sentenceC.append(' '.join([token.value for token in sentence]))

    return sentenceC
Example #4
def segment_abstract(text_stream, segout_name):
    with open("pubmed_seg_txts/" + segout_name, "a+",
              encoding='utf-8') as segmented_outfile:
        for paragraph in segmenter.process(text_stream):
            for sentence in paragraph:
                for token in sentence:
                    segmented_outfile.write(token.value + ' ')
                segmented_outfile.write('\n')  # one sentence per line
            segmented_outfile.write('\n')  # blank line between paragraphs
Example #5
    def test_process(self):
        for paragraph in segmenter.process(DOCUMENT):
            offset = 0

            for sentence in paragraph:
                for token in sentence:
                    if token.value and token.value != "tincidunt":  # tin-cidunt linebreak!
                        new_offset = DOCUMENT.find(token.value, offset)
                        self.assertNotEqual(new_offset, -1,
                                            repr(token) + " at %d" % offset)
                        offset = new_offset + len(token.value)
Example #6
def segment(text: str) -> List[List[ConllToken]]:
    sentences = []
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            sent: List[ConllToken] = []
            sentences.append(sent)
            for i, token in enumerate(sentence):
                sent.append(ConllToken(space_prefix=token.spacing, value=token.value, offset=token.offset))
                if i > 0:
                    sent[-2].space_after = sent[-1].spacing == ' '

    return sentences
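
Every example on this page walks the same nesting that syntok's `segmenter.process` yields: paragraphs, then sentences, then Token objects carrying `.spacing`, `.value` and `.offset`. A small self-contained sketch of that structure (the sample text is only illustrative):

import syntok.segmenter as segmenter

text = "First sentence. Second one.\n\nA new paragraph."
for p, paragraph in enumerate(segmenter.process(text)):
    for sentence in paragraph:
        # Each token keeps its leading whitespace and its character offset.
        print(p, [(token.spacing, token.value, token.offset) for token in sentence])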
Example #7
def process_file(input_file, clean_string=False, verbose=False):
    """Read an input (html) file from disk and process to paragraph-size chunks"""

    # Create labels from file path
    labels = '+'.join(str(input_file).split("/")[2:4])

    # Open input file
    with input_file.open() as f:
        file_as_string = f.read()

    # Process html
    paragraphs_from_html = '\n\n'.join(html2paragraph(file_as_string))
    paragraphs = segmenter.process(paragraphs_from_html)

    paragraphs_new = []
    for k, paragraph in enumerate(paragraphs):
        paragraph_new = []
        for sentence in paragraph:
            sentence_new = []
            for token in sentence:
                sentence_new.append(token.spacing + token.value)
            # Join
            sentence_new_joined = "".join(sentence_new)
            # Clean
            if clean_string:
                # Remove citations
                sentence_new_joined = re.sub(r'\[[0-9]{1,3}\]', '',
                                             sentence_new_joined)
                # Remove anything between round brackets
                sentence_new_joined = re.sub(r'\(.*\)', '',
                                             sentence_new_joined)
                # Remove anything between square brackets
                sentence_new_joined = re.sub(r'\[.*\]', '',
                                             sentence_new_joined)
            if not sentence_new_joined.endswith("Wikipedia") and len(
                    sentence_new_joined.split(" ")) > MIN_SENTENCE_TOKENS:
                paragraph_new.append(sentence_new_joined)
        # Keep only paragraphs longer than MIN_PARAGRAPH_TOKENS tokens
        if len(" ".join(paragraph_new).split(" ")) > MIN_PARAGRAPH_TOKENS:
            paragraphs_new.append("".join(paragraph_new))

    # Print
    if verbose:
        if len(paragraphs_new) > 0:
            print(
                "\t[==>] Snippets: [{}]\n\t[==>] Sample: {}\n\t[==>] Length: {}\n"
                .format(len(paragraphs_new), paragraphs_new[0],
                        len(paragraphs_new[0].split(" "))))

    # Return
    return (paragraphs_new, labels)
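
`html2paragraph` is an external helper shared by several examples on this page; from its usage it returns an iterable of plain-text paragraph strings extracted from HTML. A rough, hypothetical stand-in using BeautifulSoup (the original implementation may differ):

from bs4 import BeautifulSoup

def html2paragraph(html):
    # Hypothetical stand-in: return the visible text of each <p> element.
    soup = BeautifulSoup(html, "html.parser")
    return [p.get_text(" ", strip=True) for p in soup.find_all("p")]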
Example #8
def fasttext_iter_all_documents(input):
    import syntok.segmenter as segmenter
    for article in input.glob('./*/*/*'):
        label = '__label__' + '-'.join(
            tokenize('-'.join(str(article).split('/')[1:3])))
        print('* handling {} for {}'.format(article, label))

        with article.open() as f:
            string = f.read()
        paragraphs = '\n\n'.join(html2paragraph(string))
        paragraphs = segmenter.process(paragraphs)
        for paragraph in paragraphs:
            for sentence in paragraph:
                sentence = ' '.join(token.value for token in sentence)
                yield label, sentence
Example #9
def fix_sentences(text):
    out = ''

    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            for token in sentence:
                # roughly reproduce the input,
                # except for hyphenated word-breaks
                # and replacing "n't" contractions with "not",
                # separating tokens by single spaces
                out += token.value + ' '
            out += '\n'  # print one sentence per line
        out += '\n'

    return out
Example #10
def fasttext_estimate(input):
    import fastText as ft
    import syntok.segmenter as segmenter
    model = ft.load_model(input)
    string = sys.stdin.read()
    paragraphs = '\n\n'.join(html2paragraph(string))
    paragraphs = segmenter.process(paragraphs)
    counter = Counter()
    for paragraph in paragraphs:
        for sentence in paragraph:
            sentence = ' '.join(token.value for token in sentence)
            out = model.predict(sentence, 73)  # number of labels
            out = {out[0][i]: out[1][i] for i in range(73)}
            counter = counter + Counter(out)
    for key, value in counter.most_common(10):
        print('{}\t\t{}'.format(key, value))
Example #11
def sentence_segmenter(document):
    sentences = []
    for paragraph in segmenter.process(document):
        for sentence in paragraph:
            s_sentence = ""
            for token in sentence:
                # roughly reproduce the input,
                # except for hyphenated word-breaks
                # and replacing "n't" contractions with "not",
                # separating tokens by single spaces
                # print(token.value, end=' ')
                s_sentence += token.value + " "
            # print()  # print one sentence per line
            sentences.append(s_sentence)
        # print()  # separate paragraphs with newlines

    return "\n".join(sentences)
Example #12
def tokenize_input_doc(input_doc,
                       token_lower=True,
                       token_lower_digit=False,
                       max_sentences=15):
    # Sentences
    sent_out = list()
    a = segmenter.process(input_doc)
    # For each paragraph, do ...
    for par in a:
        # For each sentence in the paragraph, do ...
        for sent in par:
            csent = "".join([token.spacing + token.value
                             for token in sent]).strip()
            text_clean = tokenize_text(csent, token_lower, token_lower_digit)
            if text_clean is None: continue
            sent_out.append(text_clean)
    # Vectorize each input document
    docs_vectorized = tokenizer.texts_to_sequences(sent_out)
    return (docs_vectorized)
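
Both `tokenize_text` and the module-level `tokenizer` are defined outside this excerpt; the latter behaves like a fitted Keras tokenizer. A hypothetical setup under that assumption:

from tensorflow.keras.preprocessing.text import Tokenizer

# Assumed: a Keras-style tokenizer fitted on the training sentences beforehand.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(["example training sentence", "another training sentence"])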
Example #13
def find_organisations_reasons(folder: str):
    """ Go through files in the given folder, extract organisation names
        and their reason for appearance in file. """
    org_reasons, org_counts = {}, {}
    try:
        # Get flair models.
        ner_tagger, frame_tagger, pos_tagger = get_flair_taggers()
        # Fetch results from cache, if present.
        files_processed, org_reasons, org_counts = check_cache()
        file_count = 1 if len(files_processed) == 0 \
            else len(files_processed) + 1
        # Find files to process from path.
        files = glob.glob(f"{folder}/*.txt")
        print(f"Processing {len(files)} files in '{folder}'.")
        # Remove previously processed file names.
        to_process = [f for f in files if f not in files_processed]
        for path in to_process:
            print(f"[{file_count}/{len(files)}] Processing {path}...")
            # Go through paragraphs sentence by sentence and extract information.
            with open(path, "r") as file:
                paragraphs = process(file.read())
            for sentences_tokenized in paragraphs:
                for tokens in sentences_tokenized:
                    sentence = ""
                    for token in tokens:
                        sentence += f"{token.spacing}{token.value}"
                    sentence = Sentence(sentence.strip())
                    # Add NER, POS and Semantic Frame Detection tags to sentence.
                    ner_tagger.predict(sentence)
                    frame_tagger.predict(sentence)
                    pos_tagger.predict(sentence)
                    # Extract all organisations.
                    organisations = get_organisations(sentence)
                    if not organisations:
                        continue

                    # Find the first organisation occurrence and its reason for appearance.
                    for first in organisations[:1]:
                        name = clean_organization(first.text)
                        reason = get_reason_for_appearance(first, sentence)
                        add_to_organisation(name, reason, org_counts,
                                            org_reasons)

                    # Count the remaining organisations, but don't extract their reasons for
                    # appearance: organisations after the first rarely have meaningful reasons
                    # and tend to produce broken sentences.
                    for remaining in organisations[1:]:
                        name = clean_organization(remaining.text)
                        add_to_organisation(name, None, org_counts,
                                            org_reasons)

            files_processed.append(path)
            # Store in cache after processing.
            dump_to_cache(files_processed, org_reasons, org_counts)
            file_count += 1

        # Drop pronoun false positives ("I", "We") picked up as organisation names.
        for pronoun in ('I', 'We'):
            org_reasons.pop(pronoun, None)
            org_counts.pop(pronoun, None)

        print(f"\nFinished processing {file_count} files.")
        return org_reasons, org_counts
    except (Exception, KeyboardInterrupt) as e:
        # Handle errors and early exit by the user (CTRL+C).
        print(e)
        print("\n\nExiting...")
        print(f"Finished processing {file_count} files.")
        return org_reasons, org_counts
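
`get_flair_taggers` is not shown in this excerpt; it presumably loads flair's pre-trained sequence taggers. A minimal sketch under that assumption (the model names are guesses based on flair's published models):

from flair.models import SequenceTagger

def get_flair_taggers():
    # Assumed: flair's standard pre-trained NER, semantic-frame and POS models.
    ner_tagger = SequenceTagger.load("ner")
    frame_tagger = SequenceTagger.load("frame")
    pos_tagger = SequenceTagger.load("pos")
    return ner_tagger, frame_tagger, pos_tagger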
Example #14
def extract_formatted_sentences(document):
    sentences = []
    for paragraph in segmenter.process(document):
        for sentence in paragraph:
            sentences.append(" ".join([token.value for token in sentence]))
    return sentences
Example #15
def extract_paragraphs(document):
    return [paragraph for paragraph in segmenter.process(document)]
Example #16
def handle_page_paragraphs(page_num, page_text, doc_dict):
    for paragraph_num, paragraph in enumerate(segmenter.process(page_text)):
        paragraph_text = get_paragraph_text(paragraph)
        paragraph_dict = create_paragraph_dict(page_num, paragraph_num,
                                               paragraph_text, doc_dict)
        doc_dict["paragraphs"].append(paragraph_dict)
Example #17
def _segment_text_into_sentences(raw_sentence: str):
    sentences = []
    for segmented_sentences in segmenter.process(raw_sentence):
        for sentence in segmented_sentences:
            sentences.append("".join(map(str, sentence)).lstrip())
    return sentences
Example #18
def _segment_text_into_sentences(article: str):
    sentences = []
    for paragraph in segmenter.process(article):
        for sentence in paragraph:
            sentences.append("".join(map(str, sentence)).lstrip())
    return sentences
Example #19
if lnsp[0] != previous_docnr:
    if sentcount < 15:
        # Take doc number
        docnr = lnsp[0]
        # Add to inputs
        inputs[lbl0][docnr] = sent_level
        # Reset
        sent_level = list()
        sentcount = 0
        docfinish = False
# Set previous doc number to current
previous_docnr = lnsp[0]
# Process each sentence of the paragraph, unless we already have enough sentences
if docfinish: continue
# Else, use syntok to segment a paragraph into sentences
a = segmenter.process(lnsp[-1])
# For each paragraph, do ...
for par in a:
    # For each sentence in the paragraph, do ...
    for sent in par:
        csent = "".join(
            [token.spacing + token.value for token in sent]).strip()
        # Tokenize text
        txt_tok = tokenize_text(
            csent,
            lower_tokens=args.token_lower,
            remove_digits_token=args.token_remove_digits)
        # If none, pass ...
        if txt_tok is None:
            failed.append(csent)
            continue
Example #20
def extract_sentences(document):
    sentences = []
    for paragraph in segmenter.process(document):
        for sentence in paragraph:
            sentences.append(sentence)
    return sentences