def get_text(session, core, doi, fields, hash_tags, resume, solr_url,
             text_dir):
    """
    Get document fields from a Solr core and write them to a text file
    """
    if doi.startswith('http://dx.doi.org/'):
        doi = doi[18:]

    quoted_doi = quote_doi(doi)
    text_path = derive_path('', new_dir=text_dir, new_corename=quoted_doi,
                            new_ext='txt', append_tags=hash_tags)

    if resume and text_path.exists():
        log.info('skipping file {!r} because it exists'.format(text_path))
    else:
        doc = query_solr(session, solr_url, core, doi, fields)
        values = []

        for key in fields:
            try:
                val = doc[key]
            except KeyError as err:
                log.error('no {!r} field for doi {!r} in core {!r}'.format(
                    err.args[0], doi, core))
                return

            # flatten list values, e.g. for fulltext
            if isinstance(val, list):
                values.append('\n'.join(val))
            else:
                values.append(val)

        text = '\n\n'.join(values)
        log.info('creating text file {!r}'.format(text_path))
        text_path.write_text(text)

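# Illustrative usage sketch for get_text (not part of the original module).
# The Solr URL, core name, field list and directories below are assumptions
# made for the example; quote_doi, derive_path and query_solr are this
# project's own helpers.
def _example_get_text():
    import requests

    session = requests.Session()
    # Fetch title, abstract and fulltext for one DOI (10.1000/xyz123 is the
    # doi.org example DOI) and write them to data/text, skipping the file
    # if it already exists.
    get_text(session, core='articles',
             doi='http://dx.doi.org/10.1000/xyz123',
             fields=['title', 'abstract', 'fulltext'], hash_tags=None,
             resume=True, solr_url='http://localhost:8983/solr',
             text_dir='data/text')
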
def tag_var_nodes(vars_dir, trees_dir, tagged_dir):
    """
    Tag variable nodes in trees

    Tag variable nodes in trees with a "_VAR:f:n:m:e" suffix where f is the
    name of the parse file, n is the tree number, m is the variable's node
    number and e is the name of the pattern used for extracting this
    variable. Will only output those trees containing at least two
    variables.
    """
    # At first I used tregex's '-f' option to print the filename,
    # but when traversing the files in a directory, it prints the wrong
    # filenames (after the first one?), so now the filename is encoded
    # in the node label too.
    tagged_dir = Path(tagged_dir)
    tagged_dir.makedirs_p()

    for vars_fname in Path(vars_dir).glob('*.json'):
        d = defaultdict(list)

        # create a dict mapping each tree number to a list of
        # (nodeNumber, extractName) tuples for its variables
        for record in json.load(vars_fname.open()):
            pair = record['nodeNumber'], record['key']
            d[record['treeNumber']].append(pair)

        lemtree_fname = record['filename']
        parses = (Path(trees_dir) / lemtree_fname).lines()
        tagged_parses = []

        for tree_number, pairs in d.items():
            if len(pairs) > 1:
                # tree numbers in records count from one
                tree = Tree.fromstring(parses[tree_number - 1])
                # get NLTK-style indices for all nodes in a preorder
                # traversal of the tree
                positions = tree.treepositions()
                vars_count = 0

                for node_number, key in pairs:
                    # node numbers in records count from one
                    position = positions[node_number - 1]
                    subtree = tree[position]

                    try:
                        subtree.set_label(
                            '{}_VAR_{}'.format(subtree.label(), key))
                    except AttributeError:
                        log.error('skipping variable "{}" because it is a '
                                  'leaf node ({})'.format(subtree, key))
                    else:
                        vars_count += 1

                if vars_count > 1:
                    tagged_parses.append(tree.pformat(margin=99999))

        if tagged_parses:
            tagged_fname = derive_path(lemtree_fname, new_dir=tagged_dir)
            log.info('writing tagged trees to ' + tagged_fname)
            tagged_fname.write_lines(tagged_parses)

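# Self-contained sketch (not part of the original module) of the NLTK
# mechanics that tag_var_nodes relies on: treepositions() lists nodes in
# preorder, indexing a Tree with such a position returns the subtree, and
# set_label() raises AttributeError on leaves, which are plain strings.
# The sentence and variable key below are made up for illustration.
def _example_tree_tagging():
    from nltk.tree import Tree

    tree = Tree.fromstring(
        '(S (NP (DT the) (NN temperature)) (VP (VBZ rises)))')
    positions = tree.treepositions()  # preorder traversal, root first
    subtree = tree[positions[1]]      # second node in preorder: the NP
    subtree.set_label(subtree.label() + '_VAR_v1')
    # pformat with a huge margin keeps each tree on a single line, as in
    # tag_var_nodes
    print(tree.pformat(margin=99999))
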
def write_relations(rel_records, rels_dir):
    """
    Write extracted relations per file as json records
    """
    rels_dir = Path(rels_dir)
    rels_dir.makedirs_p()

    for fname, rec_list in rel_records.items():
        rels_fname = derive_path(fname, new_dir=rels_dir,
                                 append_tags='rels', new_ext='json')
        log.info('writing extracted relations to ' + rels_fname)
        json.dump(rec_list, rels_fname.open('w'), indent=0)

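# Hedged sketch (not from the original code) of the rel_records input
# shape: a dict mapping each source parse filename to a list of
# json-serializable relation records. The record fields shown here are
# illustrative assumptions only.
def _example_write_relations():
    rel_records = {
        'article1.parse': [
            {'fromVar': 'temperature', 'toVar': 'sea level',
             'relation': 'increase'},
        ],
    }
    # one json file per input filename is written under data/rels
    write_relations(rel_records, rels_dir='data/rels')
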
def preproc_vars(trans_exec, trans_fname, in_vars_dir, out_vars_dir,
                 tmp_dir=None, resume=RESUME_PREP):
    """
    Preprocess variables

    Deletes determiners (DT), personal/possessive pronouns (PRP or PRP$)
    and list item markers (LS or LST).
    """
    # TODO: resume only works if tmp_dir is given
    if not tmp_dir:
        tmp = TemporaryDirectory()
        tmp_dir = tmp.name

    parts = [trans_exec, '--tag "#prep"']

    if resume:
        parts.append('--resume')

    parts += [in_vars_dir, tmp_dir, trans_fname]
    cmd = ' '.join(parts)
    log.info('\n' + cmd)
    # universal_newlines=True is passed so the return value will be a
    # string rather than bytes
    ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)

    Path(out_vars_dir).makedirs_p()

    for in_vars_fname in Path(tmp_dir).files():
        out_vars_fname = derive_path(in_vars_fname, new_dir=out_vars_dir)

        if resume and out_vars_fname.exists():
            log.info('skipping existing preprocessed file ' +
                     out_vars_fname)
            continue

        records = json.load(open(in_vars_fname))
        # Remove any var that has descendants
        # (i.e. from which a node was deleted).
        # Also remove empty vars or "NP" vars.
        out_vars_records = [rec for rec in records
                            if rec['subStr'] not in ['', 'NP'] and
                            'descendants' not in rec]

        if out_vars_records:
            log.info('writing to preprocessed variable file ' +
                     out_vars_fname)
            json.dump(out_vars_records, out_vars_fname.open('w'), indent=0)
        else:
            log.info('skipping empty preprocessed variable file ' +
                     out_vars_fname)

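# Illustrative call (not from the original code): the executable and rule
# file paths are assumptions. preproc_vars shells out to the transformer
# with '--tag "#prep"', writes intermediate json to a temp dir, then drops
# empty vars, "NP" vars and records that have descendants.
def _example_preproc_vars():
    preproc_vars(trans_exec='./transform-vars',
                 trans_fname='rules/preproc.tfm',
                 in_vars_dir='data/vars',
                 out_vars_dir='data/vars_preproc')
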
def extract_parse_trees(scnlp_files, parse_dir):
    """
    Extract parse trees (PTB labeled bracket structures) from Stanford
    CoreNLP XML output
    """
    make_dir(parse_dir)

    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)
        parse_fname = derive_path(scnlp_fname, new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)

        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for parse_elem in nlp_doc.findall(".//parse"):
                parse_file.write(parse_elem.text + "\n")

def extract_lemmatized_parse_trees(scnlp_files, parse_dir):
    """
    Extract lemmatized parse trees (PTB labeled bracket structures) from
    Stanford CoreNLP XML output
    """
    make_dir(parse_dir)

    for scnlp_fname in file_list(scnlp_files, "*.xml"):
        nlp_doc = ElementTree(file=scnlp_fname)
        parse_fname = derive_path(scnlp_fname, new_dir=parse_dir,
                                  new_ext='.parse')
        log.info("writing " + parse_fname)

        with open(parse_fname, "wt", encoding="utf-8") as parse_file:
            for sentence_elem in nlp_doc.iterfind(".//sentence"):
                lemmas = sentence_elem.iterfind("tokens/token/lemma")
                word_parse = sentence_elem.find("parse").text.strip()
                lemma_parse = " ".join(_lemmatized_node(node, lemmas)
                                       for node in word_parse.split(" "))
                parse_file.write(lemma_parse + "\n")

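# _lemmatized_node is used above but not defined in this section. Below is
# a plausible sketch of it, under the assumption that each terminal in the
# bracketed parse is replaced by the next lemma from the token stream while
# labels and closing brackets are kept. This is a guess at the helper's
# behavior, not the original implementation.
def _lemmatized_node_sketch(node, lemmas):
    if node.startswith('('):
        # non-terminal label, e.g. '(NP': keep as-is
        return node

    # terminal: a word possibly followed by closing brackets, e.g. 'cats))'
    word = node.rstrip(')')
    closing = node[len(word):]
    return next(lemmas).text + closing
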
def core_nlp(input, out_dir=OUT_DIR, annotators=ANNOTATORS,
             class_path=CLASS_PATH, version=VERSION, memory=MEMORY,
             threads=THREADS, replace_ext=REPLACE_EXT,
             output_ext=OUTPUT_EXT, options=OPTIONS, stamp=STAMP,
             resume=RESUME, use_sr_parser=USE_SR_PARSER):
    """
    Run Stanford CoreNLP

    Parameters
    ----------
    input
        input files or directories to process
    out_dir
        output directory for CoreNLP results
    annotators
        comma-separated list of CoreNLP annotators
    class_path
        directory containing the CoreNLP jar files
    version
        CoreNLP version string, used when stamping output filenames
    memory
        maximum Java heap size, passed as -Xmx (e.g. '3g')
    threads
        number of threads to run in parallel
    replace_ext
        replace the input file's extension with output_ext
    output_ext
        extension for output files
    options
        additional command line options, passed verbatim to CoreNLP
    stamp
        stamp output filenames with the CoreNLP version (implies
        replace_ext)
    resume
        skip input files for which output already exists
    use_sr_parser
        use the faster shift-reduce parser model instead of the default

    Returns
    -------
    output of the CoreNLP process as a string
    """
    in_files = file_list(input)
    make_dir(out_dir)

    cmd = ['java']

    if memory:
        cmd.append('-Xmx' + memory)

    if class_path:
        class_path = '"{}"'.format(join(class_path or '.', "*"))
        cmd.append('-cp ' + class_path)

    cmd.append('edu.stanford.nlp.pipeline.StanfordCoreNLP')

    if annotators:
        cmd.append('-annotators ' + annotators)

    if stamp:
        replace_ext = True
        output_ext = '#scnlp_v{}{}'.format(version or '', output_ext)

    if replace_ext:
        cmd.append('-replaceExtension')

    if output_ext:
        cmd.append('-outputExtension "{}"'.format(output_ext))

    if out_dir:
        cmd.append('-outputDirectory ' + out_dir)

    if threads:
        cmd.append('-threads {}'.format(threads))

    if resume:
        in_files = [fname for fname in in_files
                    if not derive_path(fname, new_dir=out_dir,
                                       new_ext=output_ext).exists()]

    if options:
        cmd.append(options)

    if 'parse' in annotators and use_sr_parser:
        cmd.append(
            '-parse.model '
            'edu/stanford/nlp/models/srparser/englishSR.ser.gz')

    # create a temporary file with input filenames
    tmp_file = NamedTemporaryFile("wt", buffering=1)
    tmp_file.write('\n'.join(in_files) + "\n")
    cmd.append('-filelist ' + tmp_file.name)

    cmd = ' '.join(cmd)
    log.info('\n' + cmd)
    ret = check_output(cmd, shell=True, universal_newlines=True)
    log.info('\n' + ret)
    return ret

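# Illustrative call (not part of the original module): run the standard
# CoreNLP pipeline up to parsing on all files in a directory, skipping
# files whose output already exists. The directory paths are assumptions;
# the annotator names are standard CoreNLP annotators.
def _example_core_nlp():
    core_nlp('data/text', out_dir='data/scnlp',
             annotators='tokenize,ssplit,pos,lemma,parse',
             memory='3g', threads=4, resume=True)
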
def vars_to_csv(vars_dir, scnlp_dir, text_dir, nodes_csv_dir,
                relation_csv_dir, max_n=None):
    """
    Transform extracted variables to csv tables that can be imported
    by neo4j

    Parameters
    ----------
    vars_dir : str
        directory containing files with extracted variables in json format
    scnlp_dir : str
        directory containing files with scnlp output in xml format
    text_dir : str
        directory containing files with original text or sentences
        (one per line)
    nodes_csv_dir : str
        output directory for nodes csv files
    relation_csv_dir : str
        output directory for relationships csv files
    max_n : int or None
        process at most max_n variable files

    Notes
    -----
    See http://neo4j.com/docs/stable/import-tool-header-format.html
    """
    # TODO: change article nodes to document

    # hold on to open files
    open_files = []

    def create_csv_file(dir, csv_fname,
                        header=(':START_ID', ':END_ID', ':TYPE')):
        csv_fname = Path(dir) / csv_fname
        log.info('creating ' + csv_fname)
        f = open(csv_fname, 'w', newline='')
        open_files.append(f)
        csv_file = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        csv_file.writerow(header)
        return csv_file

    Path(nodes_csv_dir).makedirs_p()
    Path(relation_csv_dir).makedirs_p()

    # create csv files for nodes
    articles_csv = create_csv_file(nodes_csv_dir, 'articles.csv',
                                   ('doi:ID', 'filename', ':LABEL'))
    sentences_csv = create_csv_file(nodes_csv_dir, 'sentences.csv',
                                    ('sentID:ID', 'treeNumber:int',
                                     'charOffsetBegin:int',
                                     'charOffsetEnd:int', 'sentChars',
                                     ':LABEL'))
    variables_csv = create_csv_file(nodes_csv_dir, 'variables.csv',
                                    ('subStr:ID', ':LABEL'))
    events_csv = create_csv_file(nodes_csv_dir, 'events.csv',
                                 ('eventID:ID', 'filename',
                                  'nodeNumber:int', 'extractName',
                                  'charOffsetBegin:int',
                                  'charOffsetEnd:int', 'direction',
                                  ':LABEL'))

    # create csv files for relations
    has_sent_csv = create_csv_file(relation_csv_dir, 'has_sent.csv')
    has_var_csv = create_csv_file(relation_csv_dir, 'has_var.csv')
    has_event_csv = create_csv_file(relation_csv_dir, 'has_event.csv')
    tentails_var_csv = create_csv_file(relation_csv_dir,
                                       'tentails_var.csv',
                                       (':START_ID', ':END_ID',
                                        'transformName', ':TYPE'))

    # set of all variable types in text collection
    variable_types = set()

    # mapping from DOI to text files
    doi2txt = _doi2txt_fname(text_dir)

    for json_fname in Path(vars_dir).files('*.json')[:max_n]:
        records = json.load(open(json_fname))

        if not records:
            log.warning('skipping empty variables file: ' + json_fname)
            continue

        log.info('processing variables from file: ' + json_fname)
        doi = get_doi(json_fname)

        try:
            text_fname = doi2txt[doi]
        except KeyError:
            log.error('no matching text file for DOI ' + doi)
            continue

        text = open(text_fname).read()

        # read corenlp analysis
        tree_fname = records[0]['filename']
        scnlp_fname = derive_path(tree_fname, new_dir=scnlp_dir,
                                  new_ext='xml')
        xml_tree = etree.parse(scnlp_fname)
        sentences_elem = xml_tree.getroot()[0][0]

        # create article node
        articles_csv.writerow((doi, text_fname, 'Article'))

        tree_number = None
        # mapping of record's "key" to "subStr" attribute,
        # needed for TENTAILS_VAR relation
        key2var = {}

        for rec in records:
            if rec['treeNumber'] != tree_number:
                # moving to new tree
                tree_number = rec['treeNumber']
                sent_id = '{}/{}'.format(doi, tree_number)
                # get char offsets for sentence (tree numbers start at 1)
                sent_elem = sentences_elem[int(tree_number) - 1]
                begin = int(sent_elem[0][0][2].text)
                end = int(sent_elem[0][-1][3].text)
                sent_chars = text[begin:end]
                sentences_csv.writerow((sent_id, tree_number, begin, end,
                                        sent_chars, 'Sentence'))
                has_sent_csv.writerow((doi, sent_id, 'HAS_SENT'))

            key2var[rec['key']] = var_type = rec['subStr']

            if var_type not in variable_types:
                variables_csv.writerow((var_type, 'VariableType'))
                variable_types.add(var_type)

            # FIXME weak method of detecting preprocessing
            if ('transformName' in rec and
                    not rec['transformName'].startswith('PreProc')):
                # variable is transformed, but not by preprocessing,
                # so it is tentailed
                ancestor_var_type = key2var[rec['ancestor']]
                tentails_var_csv.writerow((ancestor_var_type, var_type,
                                           rec['transformName'],
                                           'TENTAILS_VAR'))
            else:
                # observed event
                event_id = rec['key']
                event_labels = ('EventInst;' +
                                rec['label'].capitalize() + 'Inst')
                events_csv.writerow((event_id, tree_fname,
                                     rec['nodeNumber'],
                                     rec['extractName'],
                                     rec['charOffsetBegin'],
                                     rec['charOffsetEnd'], rec['label'],
                                     event_labels))
                has_event_csv.writerow((sent_id, event_id, 'HAS_EVENT'))
                has_var_csv.writerow((event_id, var_type, 'HAS_VAR'))

    # release opened files
    for f in open_files:
        f.close()

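# The csv files written by vars_to_csv are meant for Neo4j's offline batch
# importer (see the header-format link in the docstring). A hedged sketch
# of the matching Neo4j 2.x invocation; the store directory and csv paths
# are assumptions.
def _example_neo4j_import():
    from subprocess import check_call

    check_call(
        'neo4j-import --into data/graph.db'
        ' --nodes csv/nodes/articles.csv'
        ' --nodes csv/nodes/sentences.csv'
        ' --nodes csv/nodes/variables.csv'
        ' --nodes csv/nodes/events.csv'
        ' --relationships csv/rels/has_sent.csv'
        ' --relationships csv/rels/has_var.csv'
        ' --relationships csv/rels/has_event.csv'
        ' --relationships csv/rels/tentails_var.csv',
        shell=True)
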