Example #1
def main(args):
  corpus = CHILDESCorpusReader(args.dir, args.glob)
  for fileid in corpus.fileids():
    for sentence in corpus.words(fileid, relation=True):
      try:
        print_conllu(sentence, sys.stdout)
      except Exception:
        # Some sentences fail because their dependency parses are incomplete; skip them.
        pass
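print_conllu is not defined in this snippet. A minimal sketch of what such a helper might look like, assuming NLTK's relation=True output of (word, POS, 'index|head|relation') triples; the helper name and column choices here are hypothetical:

import sys

def print_conllu(sentence, out=sys.stdout):
    # Hypothetical sketch: map each (word, pos, 'index|head|relation') triple
    # onto CoNLL-U-style columns (ID, FORM, POS, HEAD, DEPREL).
    for word, pos, rel in sentence:
        idx, head, label = rel.split('|')
        out.write('\t'.join([idx, word, pos, head, label]) + '\n')
    out.write('\n')  # a blank line terminates each sentence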
Example #2
def scandirs(path, d):

    corpus_root = nltk.data.find(path)
    for currentFile in glob.glob(os.path.join(path, '*')):
        if os.path.isdir(currentFile):
            print(currentFile)
            s = sentence_cut(currentFile)  # returns name of kid (directory)
            s += '/.*.xml'
            manchester = CHILDESCorpusReader(corpus_root, s)
            li = manchester.words(speaker='MOT')  # only the mother's words
            for i in li:
                if i != 'xxx':  # skip unintelligible-speech tokens
                    d[len(i)].append(i)
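sentence_cut is project-specific (it extracts the child's directory name from the path), and d must map word lengths to lists of words, so a defaultdict(list) is the natural argument. A usage sketch, with a placeholder corpus path:

from collections import defaultdict

lengths_by_word = defaultdict(list)  # word length -> word tokens of that length
scandirs('corpora/childes/Eng-UK/Manchester', lengths_by_word)
print(len(lengths_by_word[3]))  # how many 3-letter tokens the mothers produced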
Example #3
def TextLoader(file_locations, cds):
    verb_counter = {}

    for loc in file_locations:
        dir = re.sub(r"(.+/)[^/]+$", r"\1", loc)
        fileid = re.sub(r".+/([^/]+)$", r"\1", loc)
        corpus = CHILDESCorpusReader(dir, fileid)
        age = corpus.age(fileids=fileid, month=True)
        name = loc.split("/")[-2]
        if name not in verb_counter:
            verb_counter[name] = {}

        if cds:
            spkrs = [spkr for spkr in corpus.participants(fileid)[0].keys() \
                 if spkr != "CHI"]
        else:
            spkrs = ["CHI"]
        #sents = corpus.sents(speaker = spkrs)
        tagged_words = corpus.tagged_words(speaker=spkrs,
                                           stem=True,
                                           replace=True)
        words = [
            word[0] for word in tagged_words
            if (len(word[1]) > 0 and word[1][0] == "v")
        ]

        try:
            age = int(age[0])
            if age not in verb_counter[name]:
                verb_counter[name][age] = Counter(words)
            else:
                verb_counter[name][age] += Counter(words)
        except (TypeError, ValueError, IndexError):
            pass  # skip files with missing or malformed age metadata
        '''
        for word in words:
            if (word[1][0] == "v"):
                yield(word[0], age[0], name)
                '''
        #cleaned_sent = []
        #for stem in s:
        #stem = re.sub(r'-[^~]+', "", stem)
        #if "~" in stem:
        #cleaned_sent += stem.split("~")
        #else:
        #cleaned_sent.append(stem)

        #yield (" ".join(s), age[0], name)
    return verb_counter
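A usage sketch, assuming file_locations holds paths shaped like .../ChildName/session.xml so that loc.split("/")[-2] yields the child's name (the path below is a placeholder):

counts = TextLoader(['corpora/childes/Eng-UK/Manchester/anne/020600a.xml'], cds=True)
for child, by_age in counts.items():
    for age, verbs in sorted(by_age.items()):
        print(child, age, verbs.most_common(5))  # five most frequent verb stems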
Example #4
def scandirs(path, part_ofspeech, dependencies):
    corpus_root = nltk.data.find(path)
    for currentFile in glob.glob(os.path.join(path, '*')):
        if os.path.isdir(currentFile):
            print(currentFile)
            s = sentence_cut(currentFile)  # returns name of kid (directory)
            s += '/.*.xml'
            print(s)
            manchester = CHILDESCorpusReader(corpus_root, s)
            li = manchester.words(relation=True,
                                  speaker='MOT')  # only the mother's words
            for i in li:
                for k in i:
                    if len(k) >= 3:
                        depen(k, dependencies)
                    partOfSpeech(k, part_ofspeech)
Example #5
def TextLoader(file_locations, cds):
    for loc in file_locations:
        dir = re.sub(r"(.+/)[^/]+$", r"\1", loc)
        fileid = re.sub(r".+/([^/]+)$", r"\1", loc)
        corpus = CHILDESCorpusReader(dir, fileid)
        age = corpus.age(fileids=fileid, month=True)
        name = loc.split("/")[-2]
        if cds == 1:
            spkrs = [spkr for spkr in corpus.participants(fileid)[0].keys() \
                 if spkr != "CHI"]
        else:
            spkrs = ["CHI"]

        sents = corpus.sents(speaker=spkrs, stem=True, replace=True)
        for s in sents:

            yield (" ".join(s), age[0], name)
Example #6
 def _process_metadata(self):
     '''
     copy the child and speaker metadata dicts to the CHILDESSentence object's 
     internal dictionary, prepending 'child_' and 'speaker_' accordingly
     
     This exposes the child and speaker metadata as object attributes.
     '''
     
     for k, v in self.child_metadata.items():
         if k != 'age':
             self.__dict__['child_'+k] = v
         else:
             self.__dict__['child_'+k] = CHILDESCorpusReader('','').convert_age(v)
     
     for k, v in self.speaker_metadata.items():
         self.__dict__['speaker_'+k] = v
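The CHILDESCorpusReader('', '') instantiation exists only to borrow its convert_age method, which turns a CHILDES age string into a number of months (days at most round the month up). A quick sketch:

from nltk.corpus.reader import CHILDESCorpusReader

# CHILDES encodes age as e.g. 'P2Y6M14D' (2 years, 6 months, 14 days);
# convert_age maps it to months: 2 * 12 + 6 = 30.
print(CHILDESCorpusReader('', '').convert_age('P2Y6M14D'))  # 30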
Example #7
#### add overlap presence, run on MPI-EVA-manchester

import csv
import nltk
from nltk.parse import TestGrammar
from nltk.corpus.reader import CHILDESCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.compat import string_types

NS = 'http://www.talkbank.org/ns/talkbank'

corpus_root = nltk.data.find('corpora/childes/Eng-UK')
manchester_corpus_root = nltk.data.find(
    'corpora/childes/Eng-UK/MPI-EVA-Manchester')
thomas = CHILDESCorpusReader(corpus_root, 'Thomas/.*.xml')
eleanor = CHILDESCorpusReader(manchester_corpus_root, 'eleanor/.*.xml')
fraser = CHILDESCorpusReader(manchester_corpus_root, 'fraser/.*.xml')

corpus_rt_total = 0
corpus_rt_num = 0
corpus_rt_avg = 0
corpus_noerr_rt_total = 0
corpus_noerr_rt_num = 0
corpus_noerr_rt_avg = 0
corpus_err_rt_total = 0
corpus_err_rt_num = 0
corpus_err_rt_avg = 0

corpus_total_errs = 0
Example #8
def get_corpus_reader(language):
    return CHILDESCorpusReader(corpus_root,
                               r'%s.*/.*\.xml' % language[:3].title())
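language[:3].title() turns e.g. 'english' into 'Eng', so the pattern matches collection directories such as Eng-NA or Eng-UK, assuming corpus_root points at the CHILDES data-xml tree. A usage sketch:

english = get_corpus_reader('english')  # pattern: r'Eng.*/.*\.xml'
print(len(english.fileids()))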
Example #9
    return file_folder


# Iterates through the directory
xml_files = xtract_XML_files(directory_path)

# Creates a CSV file
with open(directory + '_' + file_name + '.csv', 'w') as csvfile:
    fieldnames = ['Corpus', 'File', 'Name', 'Verb', 'Age', 'Sent',
                  'Syntatic Object', 'Event or Object']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for xml_folder in xml_files:

        corpus_folder = CHILDESCorpusReader(xml_folder, '.*.xml')

        # Stores the metadata of the corpora
        corpus_data = corpus_folder.corpus(corpus_folder.fileids())

        # Prints out corpus & child information
        for file in corpus_folder.fileids():

            # Stores all the sentences spoken by the speaker
            corpus_sents = corpus_folder.sents(file, speaker=speaker)

            # Stores all the sentences, words in stem form
            corpus_sents_stems = corpus_folder.sents(file, speaker=speaker,
                                                     stem=True)

            corpus_participant = corpus_folder.participants(file)
Example #10
"""
The main driver function for data processing, and collecting features.
"""
if __name__ == '__main__':
    t = time.time()  # Initialization
    output = []
    d = cmudict.dict()
    parser = English()

    # get corpus directories
    corpus_root_xml = nltk.data.find(
        'C:\\Users\\James\\PycharmProjects\\FIT3036\\xml')
    corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text'

    # get all xml and plain text files from specified directories
    corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
    corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

    # get all the words spoken by a child
    all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

    # init wordnet and language model
    corpus_ic = wn.ic(corpus_xml, True, 1.0)
    lm = LanguageModel(all_words)

    # collect all the features for each corpus
    for j in range(len(corpus_xml.fileids())):
        current_features = []  # init empty array to store features
        # Text initialization
        text_xml = corpus_xml.fileids()[j]
        text_plain = corpus_plain.fileids()[j]
Example #11
def read_files_by_age(file):
    phrase = []
    sentences = []
    canonical_sentences = []
    paths = []

    unknown = ['xxx', 'www', 'mm']
    ignoredMotPOS = ['chi', 'fam', 'neo']

    with open(path + file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=';')
        for row in reader:
            paths.append(row['path'])

    for i1 in paths:
        source = CHILDESCorpusReader(corpus_root, i1)
        sents_chi_pos = source.tagged_sents(speaker='CHI', replace=True)

        for i2 in sents_chi_pos:
            for j1 in i2:
                if (j1[0] not in unknown and j1[0] != ''
                        and j1[1] not in ignoredMotPOS):
                    phrase.append(j1[0])

            if phrase:
                t_sentences = []
                treated_sentence = []
                for i3 in phrase:
                    res = re.search(r"(?!'.*')\b[\w']+\b", i3)
                    t_sentences.append(res.group(0))
                treated_sentence.append(t_sentences)
                sentences.append(t_sentences)

                for i4 in treated_sentence:

                    splited_phrase = []
                    for j2 in i4:
                        rep = replacer.replace(j2)
                        if ' ' in rep:
                            a, b = rep.split(' ')
                            splited_phrase.append(canonicalTag(a))
                            splited_phrase.append(canonicalTag(b))
                        else:
                            splited_phrase.append(canonicalTag(rep))
                    canonical_sentences.append(splited_phrase)
                # sentences = []
            phrase = []

    # target = open('Corpus/ByAge/Original/sentence_' + file.split('.')[0] + '.txt', encoding='utf-8')
    # target_canonical =  open('Corpus/ByAge/Canonical/canonical_sentence_' + file.split('.')[0] + '.txt' , mode='w' , encoding='utf-8')

    # shutil.copyfileobj(source, target)

    target = open(
        'Corpus/Sentences/ByAge/Original/sentence_' + file.split('.')[0] +
        '.txt', 'w')
    target_canonical = open(
        'Corpus/Sentences/ByAge/Canonical/canonical_sentence_' +
        file.split('.')[0] + '.txt', 'w')

    for i5 in sentences:
        for j4 in i5:
            target.write(j4 + ' ')
        target.write('\n')
    target.close()
    print('done (original)')

    for i6 in canonical_sentences:
        for j5 in i6:
            target_canonical.write(j5 + ' ')
        target_canonical.write('\n')
    target_canonical.close()
    print('done (canonical)')
Example #12
def extraction_sentences():
    '''
        Processes every dataset individually, child by child.
    '''

    # lists all directories (dataset).
    for index, dataset in enumerate(os.listdir(dir_all_dataset)):
        print '======================================================'
        path_dataset = dir_all_dataset + dataset
        print '> Dataset {} : {} \n'.format(index, dataset)

        # lists all subdirectories (children).
        for dir_child in os.listdir(path_dataset):
            # check if it's a directory.
            if os.path.isdir(os.path.join(path_dataset, dir_child)):
                print '>> Child:', dir_child
                '''
                    Read all .xml files, extract only the child's speech, and save it to a new .txt file.
                '''
                path_files_xml = path_dataset + '/' + dir_child + '/xml'
                corpus_root = nltk.data.find(path_files_xml)
                ccr = CHILDESCorpusReader(corpus_root, '/.*.xml')

                file_speaks_child = open(
                    path_dataset + '/' + dir_child + '/' + dir_child.lower() +
                    '_speaks_chi.txt', 'w')

                for file_xml in os.listdir(path_files_xml):
                    if file_xml.endswith('.xml'):
                        age = ccr.age(file_xml)  # get the child's age.
                        age = str(age)  # convert to string

                        if age != '[None]':  # skip files with missing age metadata
                            # age format: P 2Y 4M 9D | P 1Y 11M 29D
                            new_age = []

                            # out = P-1Y-11M-29D-
                            for ch in str(age):
                                new_age.append(ch)
                                if ch.isalpha():
                                    new_age.append('-')

                            new_age = ''.join(new_age)  # join back into one string
                            # strip the list-formatting characters
                            new_age = new_age.replace('[', '').replace(
                                ']', '').replace("'", '')

                            # split the 'P' prefix from the age fields
                            P, Y = new_age.split('-', 1)

                            if Y[0] == age_child_limit:  # keep only ages whose year digit matches age_child_limit
                                sentences = ccr.sents(file_xml,
                                                      speaker=['CHI'])

                                for sentence in sentences:
                                    try:
                                        file_speaks_child.write(
                                            str(" ".join(sentence) + '\n'))
                                    except UnicodeError:
                                        file_speaks_child.write(
                                            str(" ".join(sentence).encode(
                                                'utf-8') + '\n'))

                file_speaks_child.close()

                print '>> Extracted child speech: ' + dir_child.lower(
                ) + '_speaks_chi.txt'
                '''
                    Create a new file with the sentences that have five or more tokens.
                    Crie um novo arquivo com as frases que tenham cinco ou mais tokens.
                '''
                if dir_child != 'xml':
                    list_sentences_temp = []

                    file_input = dir_child.lower() + '_speaks_chi.txt'
                    file_output = dir_child.lower(
                    ) + '_speaks_chi_selected.txt'

                    with open(path_dataset + '/' + dir_child + '/' +
                              file_input) as document:
                        for line in document:
                            sentence = to_treat_bigram(line.split())
                            sentence = remove_stops_words(sentence.split())
                            sentence = check_sentence(sentence)
                            # spellchecker(sentence.split())

                            if sentence != '' and sentence is not None and len(
                                    sentence.split()) >= tokens_limit:
                                # Append a final period and store the sentence in a temporary list.
                                list_sentences_temp.append(
                                    sentence.capitalize() + '.')

                    # Remove duplicate sentences.
                    list_sentences_temp = set(list_sentences_temp)

                    # Save the selected sentences.
                    file_speaks_child_selected = open(
                        path_dataset + '/' + dir_child + '/' + file_output,
                        'w')  # a+
                    for sent in list_sentences_temp:
                        file_speaks_child_selected.write(str(sent) + '\n')
                    file_speaks_child_selected.close()

                    print '>> Selected child speech: ' + dir_child.lower(
                    ) + '_speaks_chi_selected.txt'

                print ''

        print ''
Example #13
# CHILDES Corpus

- NLTK can deal with the xml format of the CHILDES corpus
- CHILDES xml is available at [https://childes.talkbank.org/data-xml/](https://childes.talkbank.org/data-xml/)


import nltk
from nltk.corpus.reader import CHILDESCorpusReader
r = CHILDESCorpusReader('../../../Corpus/CHILDES/Chinese/Chang1_xml/', '.*.xml')

r.fileids()

# print basic profile for each xml
for f in r.fileids()[:5]:
    cur_corpus = r.corpus(f)[0]
    print(cur_corpus['Corpus'],
          cur_corpus['PID'],
          cur_corpus['ActivityType'],
          cur_corpus['Date'])
    print("Num of Words: {}".format(len(r.words(f))))
    print("Num of Sents: {}".format(len(r.sents(f))))

# participants
r.participants(fileids=r.fileids()[10])[0]  # participants of a single file

all_speakers = r.participants()

for speakers_cur_file in all_speakers[:5]:
    print("====")
    for spid in speakers_cur_file.keys():
        cur_spid_data = speakers_cur_file[spid]
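Each entry in all_speakers is a dict keyed by speaker ID ('CHI', 'MOT', ...); the values are metadata dicts whose field names follow the CHILDES XML participant attributes. A sketch of drilling into them:

for speakers_cur_file in all_speakers[:5]:
    for spid, data in speakers_cur_file.items():
        # e.g. CHI -> Target_Child, MOT -> Mother
        print(spid, data.get('role'), data.get('age'))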
Example #14
def main():
    nltk_download_dir = '/home/rodriguesfas/nltk_data'

    brown_corpus_root = os.path.join(nltk_download_dir,
                                     'corpora/CHILDES/Eng-NA-MOR/Valian/')
    brown_corpus = CHILDESCorpusReader(root=brown_corpus_root, fileids='.+xml')

    print brown_corpus.fileids()[:5]

    fileids = ['02b.xml', '03a.xml']

    print brown_corpus.age(fileids=fileids)
    print brown_corpus.MLU(fileids=fileids)

    print {fid: brown_corpus.age(fileids=fid)[0] for fid in fileids}
    print {fid: brown_corpus.MLU(fileids=fid)[0] for fid in fileids}

    print {
        fid: brown_corpus.age(fileids=fid, month=True)[0]
        for fid in fileids
    }

    metadata = brown_corpus.participants(fileids='03a.xml')[0]

    ## comment this if you don't have the pretty package
    print(metadata)

    ## uncomment this if you don't have the pretty package
    #print metadata

    print 'words:', brown_corpus.words(fileids='03a.xml')[:7]
    print 'sents:', brown_corpus.sents(fileids='03a.xml')[:3]

    print 'tagged words:', brown_corpus.tagged_words(fileids='03a.xml')[:7]
    print 'tagged sents:', brown_corpus.tagged_sents(fileids='03a.xml')[:3]

    print "Adam:", '\t', brown_corpus.sents(fileids='03a.xml',
                                            speaker='CHI')[:5]
    print "Mother:", brown_corpus.sents(fileids='03a.xml', speaker='MOT')[:2]

    mother_unstemmed = brown_corpus.sents(fileids='03a.xml', speaker='MOT')

    mother_stemmed = brown_corpus.sents(fileids='03a.xml',
                                        speaker='MOT',
                                        stem=True)
    mother_root = [[
        stemmed_word.split('-')[0] for stemmed_word in stemmed_sent
    ] for stemmed_sent in mother_stemmed]

    print 'Raw:\t\t', mother_unstemmed[:2]
    print 'Stemmed:\t', mother_stemmed[:2]
    print 'Roots only:\t', mother_root[:2]
Example #15
import os
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find(
    '/Users/tessacharlesworth/Desktop/Embeddings/Raw Data/CHILDES raw text/'
)  # change to local directory where the raw text files are stored
childes = CHILDESCorpusReader(corpus_root, '\S*.xml')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

file_count = 0
writefile_children = open(
    "/Users/tessacharlesworth/Desktop/Embeddings/Clean Data/CHILDES clean text/corpus_children.txt",
    'w+'
)  # change to local directory where the combined text files should be stored; keep "corpus_children.txt", 'w+'" at the end
for (root, dirs, files) in os.walk(
        "/Users/tessacharlesworth/Desktop/Embeddings/Raw Data/CHILDES raw text/",
        topdown=True
):  # change to local directory where the raw text files are stored
    for file in files:
        if file[-4:] == '.xml':
            print(file)
            output = childes.words(os.path.join(root, file),
                                   speaker=['CHI'],
                                   replace=True)
            # drop unintelligible-speech placeholders (runs of five to eight x's)
            for junk in ('xxxxxxxx', 'xxxxxxx', 'xxxxxx', 'xxxxx'):
                output = [w for w in output if w != junk]
Example #16
    'across', 'everything', 'maybe', 'big', 'little', 'nice', 'wow', 'new',
    'cool', 'else', 'ago', 'almost', 'another', 'ahead', 'always', 'already',
    'whoops', 'em', 'wan', 'much', 'nope', 'hum', 'anyways', 'yet', 'though',
    'somethin', 'cha', 'anything', 'somebody', 'may', 'still', 'uhoh', 'also',
    'instead', 'whose', 'without', 'behind', 'anybody', 'any', 'away', 'why',
    'please', 'yay', 'oops', 'any', 'please', 'another', 'something', 'very'
])
#sw = [stemmer.stem(item) for item in sw]

with open('animal.csv', 'rb') as f:
    reader = csv.reader(f)
    animal = []
    for row in reader:
        animal.extend(row)

childes = CHILDESCorpusReader(corpus_root, '.*.xml', lazy=False)
files = childes.fileids()
resultlist = []

for filename in files:
    sents = childes.sents(filename)  # all sentences in the file
    filew = []
    for sent in sents:
        result_lower = [item.lower() for item in sent]
        #result_stem = [stemmer.stem(item) for item in result_lower]
        result_clean = [
            item for item in result_lower
            if '\'' not in item and '_' not in item and len(item) > 1
        ]
        result = [item for item in result_clean if item not in sw]
        filew.extend(result)
Example #17
#### add overlap presence, run on MPI-EVA-manchester

import csv
import nltk
from nltk.parse import TestGrammar
from nltk.corpus.reader import CHILDESCorpusReader
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
from nltk.util import flatten, LazyMap, LazyConcatenation
from nltk.compat import string_types

NS = 'http://www.talkbank.org/ns/talkbank'

corpus_root = nltk.data.find('corpora/childes/Eng-UK')
manchester_corpus_root = nltk.data.find(
    'corpora/childes/Eng-UK/MPI-EVA-Manchester')
eleanor = CHILDESCorpusReader(manchester_corpus_root, 'eleanor/.*.xml')
fraser = CHILDESCorpusReader(manchester_corpus_root, 'fraser/.*.xml')


def getUtterance(xmlsent):
    utterance = ""
    for word in xmlsent.findall('.//{%s}w' % NS):
        if word.text is not None:
            utterance = utterance + " " + word.text
    return utterance


def getRT(s1, s2):
    s1_media = s1.find('.//{%s}media' % NS)
    s2_media = s2.find('.//{%s}media' % NS)
    if s1_media is not None and s2_media is not None:
Example #18
#code from http://www.nltk.org/howto/childes.html

import nltk
from nltk.corpus.reader import CHILDESCorpusReader
corpus_root = ('/Users/callab/Documents/Projects/CHILDES/Corpora/')

def remove_non_ascii_1(text):
    return ''.join(i for i in text if ord(i)<128)

#providence = CHILDESCorpusReader(corpus_root, 'childes_corpora/Providence/.*.xml')
childes = CHILDESCorpusReader(corpus_root, '.*.xml')


# Some useful snippets.
# display the file names for the corpus
childes.fileids()
#count the number of files
len(childes.fileids())
#printing properties of the corpus files
corpus_data = childes.corpus(childes.fileids())
print(corpus_data[0]['Lang'])
for key in sorted(corpus_data[1].keys()):
    print(key ,":", corpus_data[1][key])
# Printing participant information: CHI (target child), MOT (mother), INV (investigator)
#something is wrong in my print
corpus_participants = childes.participants(childes.fileids())
for this_corpus_participants in corpus_participants[3:5]:
    for key in sorted(this_corpus_participants.keys()):
        dct = this_corpus_participants[key]
        print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])
#printing words
Example #19
def main():
    nltk_download_dir = '/home/rodriguesfas/nltk_data'

    brown_corpus_root = os.path.join(nltk_download_dir,
                                     'corpora/CHILDES/Eng-NA-MOR/Valian/')

    brown_corpus = CHILDESCorpusReader(root=brown_corpus_root, fileids='.+xml')

    # display the files
    print brown_corpus.fileids()

    # count the number of files
    print len(brown_corpus.fileids())

    # display file properties
    corpus_data = brown_corpus.corpus(brown_corpus.fileids())
    print(corpus_data[0]['Lang'])

    for key in sorted(corpus_data[0].keys()):
        print(key, ": ", corpus_data[0][key])

    # Printing information about the corpus participants.
    # The most common participant codes are 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
    corpus_participants = brown_corpus.participants(brown_corpus.fileids())
    for this_corpus_participants in corpus_participants[:2]:
        for key in sorted(this_corpus_participants.keys()):
            dct = this_corpus_participants[key]
            print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])

    # printing words.
    print brown_corpus.words('01a.xml')

    # printing sentences.
    print brown_corpus.sents('01a.xml')

    #You can specify the participants with the argument speaker.
    print brown_corpus.words('01a.xml', speaker=['INV'])
    print brown_corpus.words('01a.xml', speaker=['MOT'])
    print brown_corpus.words('01a.xml', speaker=['CHI'])

    # tagged_words() and tagged_sents() return the usual (word,pos) tuple lists.
    # POS tags in the CHILDES are automatically assigned by MOR and POST programs (MacWhinney, 2000).
    print brown_corpus.tagged_words('01a.xml')[:30]

    print brown_corpus.tagged_sents('01a.xml')[:10]

    # When the argument stem is true, the word stems (e.g., 'is' -> 'be-3PS') are used instead of the original words.
    print brown_corpus.words('01a.xml')[:30]
    print brown_corpus.words('01a.xml', stem=True)[:30]

    # When the argument replace is true, the replaced words are used instead of the original words.
    print brown_corpus.words('01a.xml', speaker='CHI')[247]
    print brown_corpus.words('01a.xml', speaker='CHI', replace=True)[247]

    # When the argument relation is true, the grammatical relations in the sentence are returned.
    # See Sagae et al. (2010) for details of the relational structure adopted in CHILDES.
    print brown_corpus.words('01a.xml', relation=True)[:10]

    # Printing age. When the argument month is true, the age information in the CHILDES format is converted into the number of months.
    print brown_corpus.age()
    print brown_corpus.age('01a.xml')
    print brown_corpus.age('01a.xml', month=True)

    # Printing MLU. The criteria for the MLU computation is broadly based on Brown (1973).
    print brown_corpus.MLU()
    print brown_corpus.MLU('01a.xml')

    # Basic stuff
    # Count the number of words and sentences of each file.

    for this_file in brown_corpus.fileids()[:6]:
        print(
            brown_corpus.corpus(this_file)[0]['Corpus'],
            brown_corpus.corpus(this_file)[0]['Id'])
        print("num of words: %i" % len(brown_corpus.words(this_file)))
        print("num of sents: %i" % len(brown_corpus.sents(this_file)))
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-UK-MOR/')

belfast = CHILDESCorpusReader(corpus_root, 'Belfast/.*.xml')
cruttenden = CHILDESCorpusReader(corpus_root, 'Cruttenden/.*.xml')
manchester = CHILDESCorpusReader(corpus_root, 'Manchester/.*.xml')
tommerdahl = CHILDESCorpusReader(corpus_root, 'Tommerdahl/.*.xml')

print(len(belfast.fileids()))
print(len(cruttenden.fileids()))
print(len(manchester.fileids()))
print(len(tommerdahl.fileids()))
Example #21
    Date: 29/06/2018
    Author: RodriguesFAS
    Contact: <*****@*****.**>
    Tutorial: http://www.nltk.org/howto/childes.html 
              http://ling-blogs.bu.edu/lx390f17/standoff-annotation-xml-and-more-childes
              
"""
#%%
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find(
    '/home/rodriguesfas/Mestrado/workspace/specana.prototype/dataset/corpora/childes/data/eng-uk/Belfast/'
)

ccr = CHILDESCorpusReader(corpus_root, 'Barbara/.*.xml')

print ccr.fileids()

# Count the number of files.
print len(ccr.fileids())

# Printing properties of the corpus files.
corpus_data = ccr.corpus(ccr.fileids())
print(corpus_data[0]['Lang'])

for key in sorted(corpus_data[0].keys()):
    print(key, ": ", corpus_data[0][key])

# Printing information about the corpus participants. The most common codes for
# the participants are 'CHI' (target child), 'MOT' (mother) and 'INV' (investigator).
Example #22
import nltk
import csv

from nltk.corpus.reader import CHILDESCorpusReader

corpus_root = nltk.data.find('corpora/CHILDES/data-xml/Eng-UK-MOR/')
reader = CHILDESCorpusReader(corpus_root, '.*.xml')

# TODO: duplicate files

file_age = []


def save_folder_by_age(path, age):
    size = 6
    base = int(age / size)
    with open(
            'Corpus/FolderByAge/' + 'age_' + str(base * size) + '_' +
            str(((base + 1) * size) - 1) + '.csv', 'a') as csvfile:
        fieldnames = ['pathInput', 'age']
        writer = csv.DictWriter(csvfile,
                                fieldnames=fieldnames,
                                delimiter=';',
                                lineterminator='\n')
        writer.writerow({'pathInput': path, 'age': age})


def get_age_in_months(arquivo):
    age = reader.age(arquivo)[0]
    year = 0
    month = 0
Example #23
import re, csv, os, nltk
import syllable3 as sy
from nltk.corpus import cmudict
from nltk.corpus.reader import CHILDESCorpusReader


## 1) load corpus
# Emma
#corpus_root = nltk.data.find('C:\\Users\\Emma\\AppData\\Roaming\\nltk_data\\corpora\\Brown')
#brown = CHILDESCorpusReader(corpus_root, '.*\\.*.xml')
# Andrew
corpus_root = nltk.data.find('/Users/apc38/Dropbox/workspace/Corpora/CHILDES/xml/BrownXML')
brown = CHILDESCorpusReader(corpus_root, '.*/.*.xml')
fileidlist = brown.fileids()

## 2) make a list of all participants other than children
partislist = brown.participants(fileidlist)
plist = []
patt = re.compile('CHI')
for pdict in partislist:
    for p in pdict.keys():
        if patt.match(p):
            print('ignoring child')
        else:
            print('not a child, this is', p)
            if p not in plist:
                plist.append(p)
                print('added to list, list is now', len(plist), 'items long')

## 3) for each file, get sentences and phoneticize using CMU pronunciation dictionary
transcr = cmudict.dict()
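cmudict.dict() maps each lowercase word to one or more ARPAbet pronunciations, which is the lookup this phoneticization step relies on. A quick sketch:

print(transcr['cat'])     # [['K', 'AE1', 'T']]
print(transcr['tomato'])  # two entries, one per accepted pronunciation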