class TestDocument(unittest.TestCase):

    def setUp(self):
        with open("test.xml", "r") as xml_file:
            self._document = Document(xml_file.read())

    def test_sentiment(self):
        self.assertIsNone(self._document._sentiment,
                          "Sentiment should be lazy-loaded")
        expected = 1.2173913043478262
        self.assertEquals(expected, self._document.sentiment,
                          "Sentiment should be returned for public property")
        self.assertIsNotNone(self._document._sentiment,
                             "Sentiment should be memoized")
        self.assertEquals(expected, self._document._sentiment,
                          "Sentiment should be memoized")

    def test_sentences(self):
        self.assertIsNone(self._document._sentences_dict,
                          "Sentences should be lazy-loaded")
        sentences = self._document._get_sentences_dict().values()
        self.assertIsNotNone(self._document._sentences_dict,
                             "Sentences should be memoized")
        self.assertGreater(len(sentences), 0, "We should have sentences")
        for sentence in sentences:
            self.assertIsInstance(sentence, Sentence,
                                  "Sentences should be a list of only sentences")
        self.assertEquals(self._document.sentences, sentences,
                          "Sentences property should work")
        self.assertIsInstance(self._document._sentences_dict, OrderedDict,
                              "Protected sentences should be ordered")

    def test_get_sentence_by_id(self):
        sentence = self._document.get_sentence_by_id(1)
        self.assertIsInstance(sentence, Sentence,
                              "Should return a Sentence instance")
        self.assertEquals(sentence.id, 1,
                          "Sentence returned should have the appropriate ID")
        self.assertIsNone(self._document.get_sentence_by_id(-1),
                          "If the ID doesn't exist, we should get None")
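# A minimal sketch of the imports and runner this test case appears to assume:
# unittest, OrderedDict, and the Document/Sentence classes from
# corenlp_xml.document, plus a CoreNLP XML output file saved as test.xml in
# the working directory. Exact module paths in the original project may differ.
import unittest
from collections import OrderedDict

from corenlp_xml.document import Document, Sentence

if __name__ == "__main__":
    unittest.main()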
def convert(self, id, result, format):
    assert format in ["csv"]
    try:
        doc = Document(result.encode("utf-8"))
    except:
        logging.exception("Error on parsing xml")
        raise
    s = StringIO()
    w = csv.writer(s)
    w.writerow(["doc_id", "sentence", "token_id", "offset", "token", "lemma",
                "POS", "pos1", "NER", "relation", "parent"])
    parents = {}  # sentence, child.id : (rel, parent.id)
    for sent in doc.sentences:
        if sent.collapsed_ccprocessed_dependencies:
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    parents[sent.id, dep.dependent.idx] = (dep.type, dep.governor.idx)
    for sent in doc.sentences:
        for t in sent.tokens:
            rel, parent = parents.get((sent.id, t.id), (None, None))
            w.writerow([id, sent.id, t.id, t.character_offset_begin, t.word,
                        t.lemma, t.pos, POSMAP[t.pos], t.ner, rel, parent])
    return s.getvalue()
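# Hedged usage sketch for the converter above. "CSVConverter" is an assumed
# stand-in for whatever class this convert() method belongs to, and the file
# names are illustrative; the input is the raw XML string that CoreNLP writes
# with -outputFormat xml.
with open("article_1.txt.xml") as f:
    xml_text = f.read()

csv_text = CSVConverter().convert("article_1", xml_text, "csv")
with open("article_1.csv", "w") as out:
    out.write(csv_text)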
def stanford_to_saf(xml_bytes):
    doc = Document(xml_bytes)
    saf = collections.defaultdict(list)
    saf['header'] = {
        'format': "SAF",
        'format-version': "0.0",
        'processed': {
            'module': "corenlp",
            'module-version': _CORENLP_VERSION,
            "started": datetime.now().isoformat()
        }
    }

    tokens = {}  # (xml_sentid, xml_tokenid) : saf_tokenid

    def tokenid(sentid, tokenid):
        if (sentid, tokenid) in tokens:
            raise ValueError(
                "Duplicate tokenid: {sentid}, {tokenid}".format(**locals()))
        saf_tokenid = len(tokens) + 1
        tokens[sentid, tokenid] = saf_tokenid
        return saf_tokenid

    for sent in doc.sentences:
        saf['tokens'] += [
            dict(id=tokenid(sent.id, t.id),
                 sentence=sent.id,
                 offset=t.character_offset_begin,
                 lemma=t.lemma,
                 word=t.word,
                 pos=t.pos,
                 pos1=_POSMAP[t.pos])
            for t in sent.tokens
        ]

        saf['entities'] += [{
            'tokens': [tokens[sent.id, t.id]],
            'type': t.ner
        } for t in sent.tokens if t.ner not in (None, 'O')]

        if sent.collapsed_ccprocessed_dependencies:
            links = sent.collapsed_ccprocessed_dependencies.links
            saf['dependencies'] += [{
                'child': tokens[sent.id, dep.dependent.idx],
                'parent': tokens[sent.id, dep.governor.idx],
                'relation': dep.type
            } for dep in links if dep.type != 'root']

    if doc.coreferences:
        saf['coreferences'] = [[[
            tokens[m.sentence.id, t.id] for t in m.tokens
        ] for m in coref.mentions] for coref in doc.coreferences]

    saf['trees'] = [{
        'sentence': s.id,
        'tree': s.parse_string.strip()
    } for s in doc.sentences if s.parse_string is not None]

    # remove default and empty elements
    return {k: v for (k, v) in iteritems(saf) if v != []}
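# Hedged usage sketch: feed stanford_to_saf() the raw bytes of a CoreNLP XML
# file and serialise the resulting SAF dict as JSON. The file name is
# illustrative only.
import json

with open("article_1.txt.xml", "rb") as f:
    saf = stanford_to_saf(f.read())

print(sorted(saf.keys()))          # e.g. ['dependencies', 'entities', 'header', 'tokens', 'trees']
print(json.dumps(saf, indent=2))   # full SAF document as JSON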
def get_document_by_id(doc_id):
    """
    Accesses parsed XML from S3

    :param doc_id: the document ID
    :type doc_id: str

    :return: A document object wrapping CoreNLP's XML
    :rtype: corenlp_xml.document.Document
    """
    service_response = ParsedXmlService().get(doc_id)
    document = None
    if service_response.get('status') == 200:
        try:
            document = Document(service_response[doc_id])
        except XMLSyntaxError:
            document = Document('<xml/>')
    return document
def corenlp2naf(xml_bytes, annotators):
    """
    Convert CoreNLP XML output into a NAF object.
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            dependencies = True
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(),
                                                     t=dep.type,
                                                     o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(),
                                          dep.type, comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id()
                           for t in m.tokens}
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(
                LAYERMAP[annotator],
                "CoreNLP {annotator}".format(**locals()),
                get_corenlp_version())

    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
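# Hedged usage sketch: convert CoreNLP XML bytes to NAF and write the result
# to disk. The annotator list and file names are illustrative; LAYERMAP in the
# function above decides which annotators get a linguistic-processor entry.
with open("article_1.txt.xml", "rb") as f:
    naf_bytes = corenlp2naf(f.read(), ["tokenize", "pos", "lemma", "parse"])

with open("article_1.naf", "wb") as out:
    out.write(naf_bytes)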
def build_tree(text, drugs):
    """passes text to tree constructor for use by parse_tree().
    Should only be used in cases of sentences containing multiple drug
    references.

    ARGS:
        text: string.
            text body from comment
        drugs: list.
            list of strings, containing mentioned drugs.

    RETURNS:
        trees: list.
            list of nltk.tree.Tree objects, one for each sentence.
        sentiments: list.
            list of strings, containing the sentiment rating from coreNLP
            for each sentence.
    """
    with open('input.txt', 'w') as writefile:
        writefile.write(text.encode('utf8'))
    with open('corenlp.log', 'w') as logfile:
        subprocess.call(
            "/home/jrwalk/corenlp/corenlp.sh -annotators "
            "tokenize,ssplit,parse,pos,sentiment "
            "-file input.txt -outputFormat xml",
            shell=True,
            stdout=logfile)
    xmlstring = ''
    with open('input.txt.xml', 'r') as readfile:
        for line in readfile:
            xmlstring += line
    os.remove('input.txt')
    os.remove('input.txt.xml')
    os.remove('corenlp.log')
    doc = Document(xmlstring)
    sentences = doc.sentences
    trees = []
    sentiments = []
    for sent in sentences:
        sentiments.append(sent.sentiment)
        tree = nltk.tree.Tree.fromstring(sent.parse_string)
        trees.append(tree)
    return (trees, sentiments)
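# Hedged usage sketch for build_tree(): the comment text and drug list are
# made up, and the hard-coded corenlp.sh path above must exist for the call
# to work. Each returned tree is an nltk.tree.Tree built from the sentence's
# parse_string, paired with CoreNLP's sentiment rating for that sentence.
trees, sentiments = build_tree(
    "I switched from drugA to drugB last month. The side effects are milder.",
    ["drugA", "drugB"])
for tree, sentiment in zip(trees, sentiments):
    print(sentiment)
    tree.pretty_print()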
def convert(self, id, result, format):
    assert format in ["csv"]
    try:
        doc = Document(result.encode("utf-8"))
    except:
        logging.exception("Error on parsing xml")
        raise
    s = StringIO()
    w = csv.writer(s)
    w.writerow(["id", "sentence", "offset", "word", "lemma",
                "POS", "pos1", "ner"])
    for sent in doc.sentences:
        for t in sent.tokens:
            w.writerow([id, sent.id, t.character_offset_begin, t.word,
                        t.lemma, t.pos, POSMAP[t.pos], t.ner])
    return s.getvalue()
def phrases_for_wiki_field(wiki_id, field):
    """
    Find all noun phrases in a locally-cached parse of a particular Solr
    field for a wiki.

    :type wiki_id: str|int
    :param wiki_id: The wiki ID to extract NPs from
    :type field: str
    :param field: The name of the Solr field to extract NPs from

    :rtype: list
    :return: A list of NPs in the given field for the wiki specified
    """
    path = '/data/wiki_xml/%s/%s.xml' % (wiki_id, field)
    if not os.path.exists(path):
        return []
    text = open(path, 'r').read()
    if len(text) > 0:
        document = Document(text)
        return get_pos_leaves(document, NOUN_TAGS)
    return []
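# get_pos_leaves() and NOUN_TAGS are not shown in this snippet; the helper
# below is a hypothetical sketch of what such a function might look like,
# reusing the parse_string -> nltk.tree.Tree pattern seen elsewhere on this
# page. It collects the leaf words of every subtree whose label is in the
# given tag set.
import nltk

NOUN_TAGS = ('NP',)  # assumed: the phrase labels of interest

def get_pos_leaves(document, tags):
    phrases = []
    for sentence in document.sentences:
        if sentence.parse_string is None:
            continue
        tree = nltk.tree.Tree.fromstring(sentence.parse_string)
        for subtree in tree.subtrees(lambda t: t.label() in tags):
            phrases.append(' '.join(subtree.leaves()))
    return phrases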
def setUp(self):
    with open("test.xml", "r") as xml_file:
        self._document = Document(xml_file.read())
    self._graph = self._document.sentences[0].basic_dependencies
    self._link = self._graph.links_by_type('root')[0]
    self._node = self._link.governor
def setUp(self):
    with open("test.xml", "r") as xml_file:
        self._document = Document(xml_file.read())
    self._graph = self._document.sentences[0].basic_dependencies
def interrogator(corpus, search, query = 'any', show = 'w', exclude = False, excludemode = 'any', searchmode = 'all', dep_type = 'collapsed-ccprocessed-dependencies', case_sensitive = False, save = False, just_speakers = False, preserve_case = False, lemmatag = False, files_as_subcorpora = False, only_unique = False, random = False, only_format_match = False, multiprocess = False, spelling = False, regex_nonword_filter = r'[A-Za-z0-9:_]', gramsize = 2, split_contractions = False, do_concordancing = False, maxconc = 9999, **kwargs): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" only_conc = False no_conc = False if do_concordancing is False: no_conc = True if type(do_concordancing) == str and do_concordancing.lower() == 'only': only_conc = True no_conc = False # iteratively count conc lines numconc = 0 # store kwargs locs = locals() if kwargs: for k, v in kwargs.items(): locs[k] = v locs.pop('kwargs', None) import corpkit from interrogation import Interrogation from corpus import Datalist, Corpora, Corpus, File from process import tregex_engine, get_deps import pandas as pd from pandas import DataFrame, Series from collections import Counter from other import as_regex from time import localtime, strftime from textprogressbar import TextProgressBar from process import animator from dictionaries.word_transforms import wordlist, taglemma import corenlp_xml import codecs import signal original_sigint = signal.getsignal(signal.SIGINT) if kwargs.get('paralleling', None) is None: original_sigint = signal.getsignal(signal.SIGINT) def signal_handler(signal, frame): """pause on ctrl+c, rather than just stop loop""" import signal import sys from time import localtime, strftime signal.signal(signal.SIGINT, original_sigint) thetime = strftime("%H:%M:%S", localtime()) try: sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime) except NameError: sel = input('\n\n%s: Paused. 
Press any key to resume, or ctrl+c to quit.\n' % thetime) time = strftime("%H:%M:%S", localtime()) print('%s: Interrogation resumed.\n' % time) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler) # find out if using gui root = kwargs.get('root') note = kwargs.get('note') # convert path to corpus object if corpus.__class__ not in [Corpus, Corpora, File]: if not multiprocess and not kwargs.get('outname'): corpus = Corpus(corpus, print_info = False) # figure out how the user has entered the query and normalise from process import searchfixer search = searchfixer(search, query) if 'l' in show and search.get('t'): from nltk.stem.wordnet import WordNetLemmatizer lmtzr=WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict #if hasattr(corpus, '__iter__'): # im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != list(search.values())[0] or len(list(search.keys())) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == 'each': im = True just_speakers = ['each'] if just_speakers == ['each']: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in list(search.values())): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" speakr = dummy_args.get('speaker', False) import os from process import tregex_engine # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' to_write = '\n'.join([sent._parse_string.strip() for sent in sents \ if sent.parse_string is not None]) to_write.encode('utf-8', errors = 'ignore') with open(to_open, "w") as fo: encd = to_write.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) q = list(search.values())[0] ops = ['-o', '-%s' % translated_option] concs = [] res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) if not no_conc: ops += ['-w', '-f'] whole_res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) res = format_tregex(res) whole_res = format_tregex(whole_res, whole = True) concs = make_conc_lines_from_whole_mid(whole_res, res, speakr) if root: root.update() try: os.remove(to_open) except OSError: pass if countmode: return(len(res)) else: return res, concs def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' with open(to_open, "w") as fo: for sent in sents: statsmode_results['Sentences'] += 1 sts = sent.parse_string.rstrip() encd = sts.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith('pass')]) statsmode_results['Passives'] += numpass statsmode_results['Tokens'] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] 
statsmode_results['Words'] += len(words) statsmode_results['Characters'] += len(''.join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from other import as_regex tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/', 'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))', 'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))', 'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))', 'Open class words': r'/^(NN|JJ|VB|RB)/ < __', 'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/', 'Clauses': r'/^S/ < __', 'Interrogative': r'ROOT << (/\?/ !< __)', 'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'), 'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'), 'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w') } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query = q, options = ['-o', '-C'], corpus = to_open, root = root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + '/' + str(total_files) if kwargs.get('outname'): tot_string = '%s: %s' % (kwargs['outname'], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get('note', False): kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results, [] def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr = False): import re, os if speakr is False: speakr = '' conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if '-join-'.join([f, whole, mid]) not in duplicates: duplicates.append('-join-'.join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith('u'): if word.lower() in list(taglemma.keys()): word = taglemma[word.lower()] else: if word == 'x': word = 'Other' # only use wordnet lemmatiser when 
appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag = False): """ Find tag for WordNet lemmatisation """ import re tagdict = {'N': 'n', 'A': 'a', 'V': 'v', 'A': 'r', 'None': False, '': False, 'Off': False} if lemmatag is False: tag = 'n' # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)') tagchecker = re.compile(r'^[A-Z]{1,4}$') qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '') treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], 'n') elif lemmatag: tag = lemmatag return tag def format_tregex(results, whole = False): """format tregex by show list""" if countmode: return results import re done = [] if whole: fnames = [x for x, y in results] results = [y for x, y in results] if 'l' in show or 'pl' in show: lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get('w'): if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('w'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('l'), lemma): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('p'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('pl'), lemma): continue if exclude and excludemode == 'all': num_to_cause_exclude = len(list(exclude.keys())) current_num = 0 if exclude.get('w'): if re.search(exclude.get('w'), word): current_num += 1 if exclude.get('l'): if re.search(exclude.get('l'), lemma): current_num += 1 if exclude.get('p'): if re.search(exclude.get('p'), word): current_num += 1 if exclude.get('pl'): if re.search(exclude.get('pl'), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == 't': bits.append(word) if i == 'l': bits.append(lemma) elif i == 'w': bits.append(word) elif i == 'p': bits.append(word) elif i == 'pl': bits.append(lemma) joined = '/'.join(bits) done.append(joined) if whole: done = zip(fnames, done) return done def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index == 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = ''.join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True): from collections import 
Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == 'any': pattern = r'.*' if not split_contractions: list_of_toks = unsplitter(list_of_toks) #list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index+x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[' '.join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in list(ngrams.items()): if v > 1: for i in range(v): result.append(k) if countmode: return(len(result)) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Query %s' % (thetime, error_message)) if root: return 'Bad query' else: raise ValueError('%s: Query %s' % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == 'Bad query': return 'Bad query' if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. 
""" import re if concordancing: pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})' compiled_pattern = compiler(pattern) if compiled_pattern == 'Bad query': return 'Bad query' matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return(len(matches)) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == 'uk': usa_convert = {v: k for k, v in list(usa_convert.items())} spell_out = [] bits = a_string.split('/') for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = '/'.join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})' pat = compiler(pat) if pat == 'Bad query': return 'Bad query' matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) if hasattr(corpus, '__iter__') and im: corpus = Corpus(corpus) if hasattr(corpus, '__iter__') and not im: im = True if corpus.__class__ == Corpora: im = True if not im and multiprocess: im = True corpus = corpus[:] # if it's already been through pmultiquery, don't do it again locs['search'] = search locs['query'] = query locs['just_speakers'] = just_speakers locs['corpus'] = corpus locs['multiprocess'] = multiprocess locs['print_info'] = kwargs.get('printstatus', True) if im: signal.signal(signal.SIGINT, original_sigint) from multiprocess import pmultiquery return pmultiquery(**locs) cname = corpus.name subcorpora = corpus.subcorpora try: datatype = corpus.datatype singlefile = corpus.singlefile except AttributeError: datatype = 'parse' singlefile = False # store all results in here results = {} count_results = {} conc_results = {} # check if just counting countmode = 'c' in show if countmode: no_conc = True only_conc = False # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get('denominator', 1) startnum = kwargs.get('startnum', 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and 't' in list(search.keys()): simple_tregex_mode = True else: if datatype == 'plaintext': if search.get('n'): raise NotImplementedError('Use a tokenised corpus for n-gramming.') #searcher = plaintext_ngram optiontext = 'n-grams via plaintext' if search.get('w'): if kwargs.get('regex', True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = 'Searching plaintext' elif datatype == 'tokens': if search.get('n'): searcher = tok_ngrams optiontext = 'n-grams via tokens' 
elif search.get('w'): if kwargs.get('regex', True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get('w')) == list: searcher = tok_by_list optiontext = 'Searching tokens' only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l'] if datatype != 'parse' and any(i in only_parse for i in list(search.keys())): raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse])) elif datatype == 'parse': if search.get('t'): searcher = slow_tregex elif search.get('s'): searcher = get_stats statsmode = True optiontext = 'General statistics' global numdone numdone = 0 no_conc = True only_conc = False do_concordancing = False else: from depsearch import dep_searcher searcher = dep_searcher optiontext = 'Dependency querying' ############################################ # Set some Tregex-related values # ############################################ if search.get('t'): translated_option = 't' query = search.get('t') # check the query q = tregex_engine(corpus = False, query = search.get('t'), options = ['-t'], check_query = True, root = root) if query is False: if root: return 'Bad query' else: return optiontext = 'Searching parse trees' if 'p' in show or 'pl' in show: translated_option = 'u' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 't' in show: translated_option = 'o' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 'w' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'c' in show: only_count = True translated_option = 'C' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'l' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' query = search['t'] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.__class__ == Datalist: to_iterate_over = {} for subcorpus in corpus: to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files elif singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for subcorpus in subcorpora: to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name): # to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if no_conc: message = 'Interrogating' else: message = 'Interrogating and 
concordancing' if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())]) if search == {'s': r'.*'}: sformat = 'features' welcome = '\n%s: %s %s ...\n %s\n Query: %s\n %s corpus ... \n' % \ (thetime, message, cname, optiontext, sformat, message) print(welcome) ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(list(to_iterate_over.keys())) else: if search.get('s'): total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12 else: total_files = sum([len(x) for x in list(to_iterate_over.values())]) par_args = {'printstatus': kwargs.get('printstatus', True), 'root': root, 'note': note, 'length': total_files, 'startnum': kwargs.get('startnum'), 'denom': kwargs.get('denominator', 1)} term = None if kwargs.get('paralleling', None) is not None: from blessings import Terminal term = Terminal() par_args['terminal'] = term par_args['linenum'] = kwargs.get('paralleling') outn = kwargs.get('outname', '') if outn: outn = outn + ': ' tstr = '%s%d/%d' % (outn, current_iter, total_files) p = animator(None, None, init = True, tot_string = tstr, **par_args) tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): conc_results[subcorpus_name] = [] count_results[subcorpus_name] = [] results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ['-o', '-' + translated_option] result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not countmode: result = format_tregex(result) if not no_conc: op += ['-w', '-f'] whole_result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not only_format_match: whole_result = format_tregex(whole_result, whole = True) conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False) if countmode: count_results[subcorpus_name] += [result] else: result = Counter(result) results[subcorpus_name] += result if not no_conc: for lin in conc_result: if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(lin) numconc += 1 current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: slow_treg_speaker_guess = kwargs.get('outname', False) if datatype == 'parse': with open(f.path, 'r') as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print('Could not read file: %s' % f.path) continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if len(just_speakers) == 1: slow_treg_speaker_guess = just_speakers[0] if not sents: continue else: sents = corenlp_xml.sentences res, conc_res = searcher(sents, search = search, show = show, dep_type = dep_type, exclude = exclude, excludemode = excludemode, searchmode = searchmode, lemmatise = False, case_sensitive = case_sensitive, 
do_concordancing = do_concordancing, only_format_match = only_format_match, speaker = slow_treg_speaker_guess) if res == 'Bad query': return 'Bad query' elif datatype == 'tokens': import pickle with codecs.open(f.path, "rb") as fo: data = pickle.load(fo) if not only_conc: res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') elif datatype == 'plaintext': with codecs.open(f.path, 'rb', encoding = 'utf-8') as data: data = data.read() if not only_conc: res = searcher(list(search.values())[0], data, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') if countmode: count_results[subcorpus_name] += [res] else: # add filename and do lowercasing for conc if not no_conc: for index, line in enumerate(conc_res): if searcher != slow_tregex: line.insert(0, f.name) else: line[0] = f.name if not preserve_case: line[3:] = [x.lower() for x in line[3:]] if spelling: line = [correct_spelling(b) for b in line] if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(line) numconc += 1 # do lowercasing and spelling if not only_conc: if not preserve_case: if not statsmode: res = [i.lower() for i in res] if spelling: if not statsmode: res = [correct_spelling(r) for r in res] #if not statsmode: results[subcorpus_name] += Counter(res) #else: #results[subcorpus_name] += res if not statsmode: current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # delete temp file if there import os if os.path.isfile('tmp.txt'): os.remove('tmp.txt') ############################################ # Get concordances into DataFrame # ############################################ if not no_conc: all_conc_lines = [] for sc_name, resu in sorted(conc_results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu #make into series pindex = 'c f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: #spkr = str(spkr, errors = 'ignore') fname = os.path.basename(fname) all_conc_lines.append(Series([sc_name, fname, \ spkr, \ start, \ word, \ end], \ index = pindex)) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) conc_df = pd.concat(all_conc_lines, axis = 1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r'] else: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(conc_df['s'].values)): conc_df.drop('s', axis = 1, inplace = True) #if kwargs.get('note'): # kwargs['note'].progvar.set(100) #if kwargs.get('printstatus', True): # thetime = strftime("%H:%M:%S", localtime()) # finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index)) # print(finalstring) from interrogation import Concordance output = Concordance(conc_df) if only_conc: output.query = locs if save: output.save(save) if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Concordancing finished! 
%d results.' % (thetime, len(conc_df)) print(finalstring) signal.signal(signal.SIGINT, original_sigint) return output #output.query = locs #return output ############################################ # Get interrogation into DataFrame # ############################################ if not only_conc: if countmode: df = Series({k: sum(v) for k, v in sorted(count_results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in list(results.values()) for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index = sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis = 1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get('df1_always_df'): df = Series(df.ix[0]) df.sort_values(ascending = False, inplace = True) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix['Total-tmp'] = df.sum() the_tot = df.ix['Total-tmp'] df = df[the_tot.argsort()[::-1]] df = df.drop('Total-tmp', axis = 0) # format final string if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Interrogation finished!' % thetime if countmode: finalstring += ' %d matches.' % tot else: finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total) print(finalstring) if not no_conc: interro = Interrogation(results = df, totals = tot, query = locs, concordance = output) else: interro = Interrogation(results = df, totals = tot, query = locs) if save: interro.save(save) signal.signal(signal.SIGINT, original_sigint) return interro
def setUp(self):
    """
    It would probably be a good idea to look into Mock, eventually
    """
    with open("test.xml", "r") as xml_file:
        self._document = Document(xml_file.read())
    self._sentence = self._document.sentences[0]
def setUp(self):
    with open("test.xml", "r") as xml_file:
        self._document = Document(xml_file.read())
def setUp(self):
    with open("test.xml", "r") as xml_file:
        self._document = Document(xml_file.read())
    self._coref = self._document.coreferences[0]
    self._mention = self._coref.mentions[0]
def document(self):
    """Return this file's parsed XML as a corenlp_xml Document"""
    from corenlp_xml.document import Document
    return Document(self.read())
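# Hedged usage sketch: assuming the accessor above is exposed as a property on
# a file wrapper (a corpkit-style File object with a read() method), iterating
# its sentences looks like this; call parsed_file.document() instead if it is
# a plain method. "parsed_file" is an assumed instance name.
doc = parsed_file.document
for sentence in doc.sentences:
    print(sentence.id, [token.word for token in sentence.tokens])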
from corenlp_xml.document import Document

with open('/data/wiki_xml/831/description_txt.xml') as f:
    xml = f.read()

doc = Document(xml)
print(doc)
for sentence in doc.sentences:
    print([token.word for token in sentence.tokens])