    def from_file(self, filename):
        """Convert the file to sentence tokens.

        This is dangerous if the file is very large. If memory were a bottleneck,
        we'd need to be trickier about reading in chunks, checking for valid
        sentences, and resetting the file pointer to the last found sentence.
        """
        with open(filename, 'r') as data:
            return self.from_text(data.read())
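
The docstring above hints at a chunked strategy for very large files. Below is a minimal sketch of that idea, assuming from_text returns a list of sentence tokens; rather than resetting the file pointer, it simply carries the unfinished tail of each chunk into the next read:

    def from_file_chunked(self, filename, chunk_size=1 << 20):
        """Hypothetical chunked variant of from_file (a sketch, not the original code)."""
        sentences = []
        leftover = ''
        with open(filename, 'r') as data:
            while True:
                chunk = data.read(chunk_size)
                if not chunk:
                    break
                buffered = leftover + chunk
                # Everything after the last clear sentence terminator may be an
                # incomplete sentence, so hold it back for the next iteration.
                cut = max(buffered.rfind(c) for c in '.!?')
                if cut == -1:
                    leftover = buffered
                    continue
                sentences.extend(self.from_text(buffered[:cut + 1]))
                leftover = buffered[cut + 1:]
        if leftover.strip():
            sentences.extend(self.from_text(leftover))
        return sentences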
Example no. 2
import os


def pre_process_single_file(original_file, discourse_project_input_dir):
    # Create the output directory if it does not already exist
    if not os.path.isdir("output"):
        os.mkdir("output")

    # Extract just the file name from the path
    filename = os.path.basename(original_file)

    with open(original_file, mode='r', encoding='utf-8') as data:
        original_data = data.read()
        restructured_text = re_structure_text(original_data)
        create_txt_file(restructured_text, filename)
        xml_filename = createXmlDocument_v2(restructured_text, filename, path=discourse_project_input_dir)
        read_xml_file(xml_filename, filename)
Example no. 3
import string

import nltk.data
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def read_file(path, stop_words):
    # Translation table that strips all punctuation
    translator = str.maketrans('', '', string.punctuation)
    with open(path) as data:
        file = data.read()

    # Split the raw text into sentences with the pre-trained Punkt model
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    text = sent_detector.tokenize(file.strip())

    # Lower-case, de-punctuate and word-tokenize each sentence
    sentences = []
    for sentence in text:
        new_sentence = sentence.translate(translator)
        sentences.append(word_tokenize(new_sentence.lower()))
    sentences.pop()  # discard the last sentence

    # Build a lemmatized word list for the whole document, minus stop words
    new_file = file.translate(translator)
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(new_file.lower())
    word_list = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]

    return text, sentences, word_list
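
A hypothetical driver for read_file; the file name essay.txt and the use of NLTK's built-in English stop-word list are assumptions, not part of the original example:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
raw_sentences, tokenized_sentences, lemmas = read_file('essay.txt', stop_words)
print(len(raw_sentences), 'sentences,', len(lemmas), 'lemmatized tokens')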
Example no. 4
import csv
import os

import nltk.data

# load the pre-trained English Punkt sentence tokenizer from NLTK
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

DATA_DIR = "../data/OANC-GrAF/data/written_1/journal/verbatim_txt/"
CSV_FILE_NAME = 'verbatim.csv'

csv_file = open(CSV_FILE_NAME, 'w', newline='')
csv_writer = csv.writer(csv_file, delimiter=',')
csv_writer.writerow(
    ['file_id', 'num_sent', 'word_count', 'count_the', 'count_this', 'count_that', 'count_these', 'count_those', 'count_a', 'count_an', 'count_one', 'freq_the', 'freq_this', 'freq_that', 'freq_these', 'freq_those', 'freq_a', 'freq_an', 'freq_one'])

with os.scandir(DATA_DIR) as all_files:
    for entry in all_files:
        with open(entry.path) as data:
            raw_text = data.read().replace("\n", " ").replace("\t", "")

        tokenized_sentences = sent_detector.tokenize(raw_text)
        #print(tokenized_sentences)
        num_sent = len(tokenized_sentences)

        word_count = {
            "the" : 0,
            "this" : 0,
            "that" : 0,
            "these" : 0,
            "those" : 0,
            "a" : 0, 
            "an" : 0, 
            "one" : 0
        }

        all_word_count = 0
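
# The loop above is truncated in this listing. A hypothetical sketch (not the
# original script) of how the tracked determiners could be tallied and written
# as one CSV row per file, matching the header columns written earlier:
from nltk.tokenize import word_tokenize

def count_row(file_id, tokenized_sentences,
              tracked=("the", "this", "that", "these", "those", "a", "an", "one")):
    counts = {w: 0 for w in tracked}
    total = 0
    for sentence in tokenized_sentences:
        for token in word_tokenize(sentence.lower()):
            if token.isalpha():  # count only alphabetic tokens (an assumption)
                total += 1
                if token in counts:
                    counts[token] += 1
    freqs = [counts[w] / total if total else 0.0 for w in tracked]
    return [file_id, len(tokenized_sentences), total, *counts.values(), *freqs]

# e.g. csv_writer.writerow(count_row(entry.name, tokenized_sentences))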
Example no. 5

"""
Original source: https://gist.github.com/Ghost---Shadow/c361f2d6b4501f40648b#file-plag-py
@author: Quan
"""

from nltk.corpus import wordnet
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import re
from nltk.tokenize import word_tokenize
from random import randint
import nltk.data
from preprocessing.tokenizer_rinehart import tokenizer

# Load a text file from the ./data folder (or use the inline sample below)
path = './data/Doyle.txt'
with open(path, 'r') as data:
    text = data.read()
# text = "Pete ate a large cake. Sam has a big mouth."

# Tokenize the text
tokenized = tokenizer(text)

# Get the list of words from the entire text
words = word_tokenize(text)

# Identify the parts of speech
tagged = nltk.pos_tag(words)
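# For illustration, nltk.pos_tag pairs each token with a Penn Treebank tag; the
# commented-out sample above would yield pairs such as ('Pete', 'NNP'),
# ('ate', 'VBD'), ('a', 'DT'), ('large', 'JJ') and ('cake', 'NN'), which is the
# structure the replace() function below iterates over.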


def replace(words, tagged):
    output = ""
    for i in range(0, len(words)):
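

# The body of replace() is cut off in this listing. Given the wordnet and
# randint imports above, the following is a speculative sketch (not the
# original gist's code) of a WordNet-based synonym substitution helper:
def replace_sketch(tagged):
    # Map Penn Treebank tag prefixes to WordNet part-of-speech constants
    pos_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'J': wordnet.ADJ, 'R': wordnet.ADV}
    output = []
    for word, tag in tagged:
        wn_pos = pos_map.get(tag[0])
        synsets = wordnet.synsets(word, pos=wn_pos) if wn_pos else []
        names = [lem.name().replace('_', ' ') for syn in synsets for lem in syn.lemmas()]
        candidates = [n for n in names if n.lower() != word.lower()]
        # Keep the original word when WordNet offers no alternative
        output.append(candidates[randint(0, len(candidates) - 1)] if candidates else word)
    return ' '.join(output)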