def create_bnc_data():
    if not os.path.exists('domain/'):
        os.mkdir('domain/')
    print("Please go to https://ota.bodleian.ox.ac.uk/repository/xmlui/handle/20.500.12024/2554")
    print("download, and save the BNC corpus file as {}".format(
        os.getcwd() + "/domain/bnc.zip"))
    # wget.download(bnc_url, 'domain/bnc.zip')
    input("Press ENTER when this is done to continue...")
    zip_file = zipfile.ZipFile('domain/bnc.zip', 'r')
    zip_file.extractall('domain/')
    bnc_reader = BNCCorpusReader(root="domain/download/Texts",
                                 fileids=r'[A-K]/\w*/\w*\.xml')
    sents = bnc_reader.sents()
    fout = open('domain/bnc.raw', 'w+')
    for sent in sents:
        fout.write(' '.join(sent) + '\n')
    fout.close()
    path = 'domain/'
    file_name = 'bnc.raw'
    num_lines = sum(1 for line in open(path + file_name))
    filter_word_level(path, file_name, 'domain_dev/', 'bnc.dev',
                      0, num_lines // 2)
    filter_word_level(path, file_name, 'domain_test/', 'bnc.test',
                      num_lines // 2 + 1, num_lines)
def it():
    reader = BNCCorpusReader(fileids=path, root=self.root)
    words_tags = reader.tagged_words(stem=False)
    stems = (s for s, _ in reader.tagged_words(stem=True))
    for (word, tag), stem in zip(words_tags, stems):
        yield Token(word, stem, tag)
def bnc_sentence_dump(root_path):
    """Randomly shuffle the BNC XML files and dump their sentences into txt
    files under a train/dev/test split (roughly 7:1:2).
    """
    all_xmls = glob(os.path.join(root_path, r'*/*/*.xml'))
    random.shuffle(all_xmls)
    train_dir = '../../../data/BNC/train/'
    test_dir = '../../../data/BNC/test/'
    dev_dir = '../../../data/BNC/dev/'
    for directory in [train_dir, test_dir, dev_dir]:
        if os.path.isdir(directory):
            shutil.rmtree(directory)
        print(f'Creating directory {directory}')
        os.mkdir(directory)
    for i, full_path in tqdm(enumerate(all_xmls)):
        root, fileid = os.path.split(full_path)
        bnc_reader = BNCCorpusReader(root=root, fileids=fileid)
        filename, ext = os.path.splitext(fileid)
        if i % 10 == 9 or i % 10 == 3:
            save_dir = test_dir
        elif i % 10 == 6:
            save_dir = dev_dir
        else:
            save_dir = train_dir
        save_path = os.path.join(save_dir, filename + '.txt')
        with open(save_path, 'w') as f:
            f.write('\n'.join([' '.join(s) for s in bnc_reader.sents()]))
def preprocess(input_folder_path, output_folder_path, combined, mode, lowercase):
    if input_folder_path[-1] != '/':
        input_folder_path = input_folder_path + '/'
    if output_folder_path[-1] != '/':
        output_folder_path = output_folder_path + '/'
    if not (os.path.exists(output_folder_path) and os.path.isdir(output_folder_path)):
        os.mkdir(output_folder_path)
    file_list = []
    if combined:
        output_file_path = output_folder_path + mode + '-corpus_preprocessed.txt'
        with open(output_file_path, 'w') as f:
            pass
        file_list = dirWalk(input_folder_path, output_file_path, file_list, combined)
    else:
        output_folder_path = (output_folder_path
                              + mode.capitalize() + '-corpus_preprocessed/')
        if not (os.path.exists(output_folder_path) and os.path.isdir(output_folder_path)):
            os.mkdir(output_folder_path)
        file_list = dirWalk(input_folder_path, output_folder_path, file_list, combined)
    for file_entry in file_list:
        root_path = file_entry[0]
        file_name = file_entry[1]
        output_file_path = file_entry[2]
        bncreader = BNCCorpusReader(root=root_path, fileids=file_name)
        words = bncreader.tagged_words(c5=True)
        if lowercase:
            data = "".join((str(word[0]).lower() + "_" + str(word[1]) + "\n")
                           for word in words)
        else:
            data = "".join((str(word[0]) + "_" + str(word[1]) + "\n")
                           for word in words)
        # mwdata = mwpreprocess(root_path+file_name, lowercase)
        # data = data+mwdata
        if combined:
            with open(output_file_path, 'a') as f:
                f.write(data)
        else:
            with open(output_file_path, 'w') as f:
                f.write(data)
def read_bnc_subcorpus(name, regexp):
    bnc_reader = BNCCorpusReader(root='./BNC/texts/', fileids=regexp)
    words = [word.lower() for word in bnc_reader.words()]
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(" ".join(words))
    csvfile = f'./words/{name}.csv'
    with open(csvfile, "w", encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        for row in tokens:
            writer.writerow([row])
def bnc_words(args):
    root, fileids, c5, stem, omit_tags = args
    logger.debug('Processing %s', fileids)
    bnc = BNCCorpusReader(root=root, fileids=fileids)
    try:
        if not omit_tags:
            return Counter(bnc.tagged_words(stem=stem, c5=c5))
        else:
            return Counter(bnc.words(stem=stem))
    except:
        logger.error('Could not process %s', fileids)
        raise
def BNC2TXT():
    bnc_reader = BNCCorpusReader(root=PATH_TO_BNC_TEXTS,
                                 fileids=r'[A-K]/\w*/\w*\.xml')
    tokenizer = RegexpTokenizer(r'\w+')
    # txt = bnc_reader.sents()  # the whole BNC corpus, sentence by sentence
    with open(NEW_BNX_TXT, 'w') as nf:
        i = 0
        for s in bnc_reader.sents():
            # sents() yields a list of tokens, so join it into a string before
            # re-tokenizing, then write one sentence per line
            nf.write(' '.join(tokenizer.tokenize(' '.join(s))) + '\n')
            i = i + 1
            if i % 100000 == 0:
                # 6026276 is the hard-coded total sentence count used for the progress estimate
                print('Joined {} Sentences, {:.2f}% Done'.format(i, 100 * i / 6026276))
def bnc_cooccurrence(args):
    """Count word cooccurrence in a BNC file."""
    root, fileids, window_size, stem, targets, context = args
    logger.debug('Processing %s', fileids)
    cooccurrences = count_cooccurrence(
        BNCCorpusReader(root=root, fileids=fileids).tagged_words(stem=stem),
        window_size=window_size,
    )
    # It might be the case that targets are just words, not (word, POS) pairs.
    # If that is the case, disregard the POS tags for targets.
    if not isinstance(targets.index[0], tuple):
        cooccurrences = ((t[0], c, n) for t, c, n in cooccurrences)
    counts = [(targets.loc[t].id, context.loc[c].id, n)
              for t, c, n in cooccurrences
              if (t in targets.index) and (c in context.index)]
    if not counts:
        return Counter()
    counts = pd.DataFrame(
        counts,
        columns=('target', 'context', 'count'),
    ).groupby(['target', 'context']).sum()
    # TODO: it would be nice to return a DataFrame.
    #
    # Later, do_sum_counters could sum up data frames instead of dicts.
    # Probably, it's not even needed to sum up counters across multiple processes.
    # Though, this needs benchmarking, for example on the SWDA targets.
    return Counter(dict(zip(counts.index, counts['count'])))
def bnc_vocabulary(root_path):
    """Prepare the vocabulary file for the BNC.

    Words that occur five times or fewer are eliminated. There are 4,049 XML
    files in the BNC; total processing time is roughly 1 h 30 min.
    """
    word_counter = Counter()
    for full_path in tqdm(glob(os.path.join(root_path, r'*/*/*.xml'))):
        root, fileids = os.path.split(full_path)
        bnc_reader = BNCCorpusReader(root=root, fileids=fileids)
        words = bnc_reader.words()
        word_counter.update(words)
    common_words = word_counter.most_common()
    common_words_5 = list(filter(lambda x: x[1] > 5, common_words))
    with open('../../../data/vocabulary/vocab_bnc_5.txt', 'w') as f:
        f.write('\n'.join([w[0] for w in common_words_5]))
def create_wordlist_from_subcorpus(name, regexp):
    bnc_reader = BNCCorpusReader(root='./BNC/texts/', fileids=regexp)
    words = [word.lower() for word in bnc_reader.words()]
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(" ".join(words))
    fdist1 = FreqDist(tokens).items()
    sorted_fdist = sorted(fdist1, key=lambda item: item[1], reverse=True)
    csvfile = f'./exported/{name}.csv'
    with open(csvfile, "w", encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(sorted_fdist)
def init_kwargs(cls, root=None, fileids=r'[A-K]/\w*/\w*\.xml'):
    if root is None:
        root = os.path.join(getcwd(), 'BNC', 'Texts')
    return dict(
        root=root,
        paths=BNCCorpusReader(root=root, fileids=fileids).fileids(),
    )
# -*- coding: utf-8 -*-
from os import listdir
from os.path import isfile, join
import pdb
import pickle
from nltk.corpus.reader.bnc import BNCCorpusReader

strPath = 'C:/mydesktop/BNC/download/Texts/'
bnc_reader = BNCCorpusReader(root=strPath, fileids=r'[A-K]/\w*/\w*\.xml',
                             lazy=False)
listDir = listdir(strPath)
# list_of_fileids = []
all_words = []
all_tagged_words = []
try:
    for strDir in listDir:  # DIR [A-K]
        listSubDir = listdir(strPath + strDir)
        for strSubDir in listSubDir:  # SUBDIR [A0-AY]
            # pdb.set_trace()
            listFile = listdir(strPath + strDir + '/' + strSubDir)
            for strFile in listFile:  # FILES [A00.xml-A0Y.xml]
                print(strFile)
                strFileID = strDir + '/' + strSubDir + '/' + strFile
                words = bnc_reader.words(stem=True, fileids=strFileID)
                all_words.append(words)
                tagged_words = bnc_reader.tagged_words(
                    stem=True, c5=True, fileids=strFileID)  # C5 tag
                all_tagged_words.append(tagged_words)
                # pdb.set_trace()
except Exception:
    # minimal handler so the try block parses; the original handler is not shown
    raise
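
# pickle is imported above but never used in this excerpt, presumably because
# the collected word lists are meant to be saved afterwards. A minimal sketch
# of that step, assuming the output file names (they are not from the original):
with open('bnc_all_words.pkl', 'wb') as fp:
    pickle.dump([list(w) for w in all_words], fp)
with open('bnc_all_tagged_words.pkl', 'wb') as fp:
    pickle.dump([list(tw) for tw in all_tagged_words], fp)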
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus.reader.bnc import BNCCorpusReader
from collections import Counter
import os
import csv

# check each verb against the control list
# cluster words with w2v - matrix verbs and their dependents
stopw = set(stopwords.words('english'))
with open('control_verbs.txt', 'r', encoding='utf-8') as f:
    control = {x.strip() for x in f.readlines()}
lmzr = WordNetLemmatizer()
r = BNCCorpusReader(root='../corpus/BNC/', fileids=r'B/\w*/\w*\.xml')
tagged_sents = r.tagged_sents(c5=True)
sents = r.sents()
matrix = []
data = []
for tsent, sent in zip(tagged_sents[:50000], sents[:50000]):
    for i in range(1, len(tsent)):
        existence = tsent[i][1] is not None and tsent[i - 1][1] is not None
        now_bareinf = existence and tsent[i][1][0] == 'V' and tsent[i][1][2] == 'I'
        now_inf = existence and tsent[i][1] == 'TO0'
        now_ger = existence and tsent[i][1][0] == 'V' and tsent[i][1][2] == 'G'
        prev_matrix = existence and tsent[i - 1][1].startswith('VV')
        prev_lex = existence and tsent[i - 1][0].lower() not in stopw
        if prev_matrix and prev_lex:
            if now_ger:
Make sure that you set the bnc_reader (line 21) to the correct path to where
the BNC is stored on your system.
"""

# We'll use the NLTK BNC reader.
# Beware -- it's very slow!
import nltk

# We'll save the results using json
import json

# We're using the BNC, which is what Payne et al. (2013) use.
from nltk.corpus.reader.bnc import BNCCorpusReader

print('Loading BNC corpus')
bnc_reader = BNCCorpusReader(root="/home/nick/nltk_data/corpora/bnc/Texts",
                             fileids=r'[A-K]/\w*/\w*\.xml')

# Write to this file:
output_file = './sample_data.json'

# Get some tagged sentences.
# The c5 tags provide more relevant information than the default tags.
print('Preparing tagged sentences.')
tagged_sentences = bnc_reader.tagged_sents(c5=True)

# Count the sentences (since we'll need this number)
print('Counting tagged sentences.')
# tagged_sentences_count = 0
# for sentence in tagged_sentences:
#     tagged_sentences_count += 1
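
# A minimal sketch of the count the commented-out loop above was after: it
# assumes the `tagged_sentences` lazy view created above and simply exhausts
# it, so it stays memory-friendly even though the pass over the BNC is slow.
tagged_sentences_count = sum(1 for _ in tagged_sentences)
print('Counted {} tagged sentences.'.format(tagged_sentences_count))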
def bnc(self):
    """BNC corpus reader."""
    root = self.kwargs['bnc']
    return BNCCorpusReader(root=root, fileids=self.fileids)
# BNC-XML - XML - CHILDES - JSON

## BNC XML
## Read BNC XML
import nltk
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Instantiate the reader like this
bnc_reader = BNCCorpusReader(root="../../../Corpus/BNC-XML/Texts/",
                             fileids=r'[A-K]/\w*/\w*\.xml')

list_of_fileids = ['A/A0/A00.xml', 'A/A0/A01.xml']
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(
    bnc_reader.words(fileids=list_of_fileids))
scored = finder.score_ngrams(bigram_measures.raw_freq)
print(scored)
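
# As a follow-up to the example above, the same finder can rank bigrams by PMI
# instead of raw frequency via the standard NLTK collocation API. The
# frequency-filter threshold of 5 and the top-20 cut-off are illustrative
# choices, not taken from the original example.
finder.apply_freq_filter(5)
top_pmi = finder.nbest(bigram_measures.pmi, 20)
print(top_pmi)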
def getContexts(lemmatization, path, wordWindow=1000):
    """
    Return a list of contexts (documents subdivided by a word window,
    1,000 words by default) of the British National Corpus.

    Parameters
    ----------
    lemmatization : bool
        Whether to return stemmed (lemmatized) word forms.
    path : str
        Path to the BNC XML document to process.
    wordWindow : int
        The word length into which the BNC documents are subdivided.
        If not given, 1,000 is used as default.

    Returns
    -------
    contextsList : list
        List of all the contexts in the corpus.
    contextInfo : str
        Id of the context.
    docInfo : list
        Id of the document.
    """
    # Set time (CPU time; time.clock() was removed in Python 3.8)
    start_time = time.process_time()

    # Set BNC reader & parameters
    bnc_reader = BNCCorpusReader(root="Resources/Corpora/BNC/Texts",
                                 fileids=r'[A-K]/\w*/\w*\.xml')

    # Check if the text is from a written source
    tags = [elem.tag
            for event, elem in ET.iterparse(path, events=("start", "end"))]
    if "wtext" in tags:
        docID = path[33:-4]

        # Reset time
        start_time = time.process_time()

        # Read in a document as a list of words
        docWordListRaw = bnc_reader.words(fileids=path[28:],
                                          strip_space=True,
                                          stem=lemmatization)

        # Preprocessing of the raw text
        docWordList = TextPreProcessing(docWordListRaw)

        # Split the document into contexts
        contextsList, contextInfo, docInfo = SplitDocuments(
            docWordList, docID, wordWindow)
    elif "stext" in tags:
        contextsList = "SPOKEN"
        contextInfo = "SPOKEN"
        docInfo = "SPOKEN"
    else:
        contextsList = "NEITHER"
        contextInfo = "NEITHER"
        docInfo = "NEITHER"

    # Print out status
    t = time.process_time()
    print('t: ', t / 60, end='\t')
    print(t - start_time, "multiprocessor seconds")
    return (contextsList, contextInfo, docInfo)
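
# SplitDocuments is referenced above but not shown. Below is a minimal sketch
# of a fixed word-window splitter along the lines the docstring describes; the
# function name, the context-id format, and the return shapes are assumptions,
# not the original implementation.
def split_documents(doc_word_list, doc_id, word_window=1000):
    """Split one document's word list into consecutive word_window-sized chunks."""
    contexts_list, context_info, doc_info = [], [], []
    for start in range(0, len(doc_word_list), word_window):
        contexts_list.append(doc_word_list[start:start + word_window])
        context_info.append('{}_{}'.format(doc_id, start // word_window))
        doc_info.append(doc_id)
    return contexts_list, context_info, doc_info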