def _doi2txt_fname(text_dir):
    """
    Create a dict mapping DOI to path of input text file

    If two text files resolve to the same DOI, the first one wins and
    the duplicate is reported via the error log.
    """
    mapping = {}

    for txt_path in Path(text_dir).files():
        doi = get_doi(txt_path)
        if doi not in mapping:
            mapping[doi] = txt_path
        else:
            # keep the first file seen for this DOI; report the clash
            log.error('DOI {} already mapped to text file {}; '
                      'ignoring text file {}'.format(doi, mapping[doi],
                                                     txt_path))

    return mapping
def vars_to_csv(vars_dir, scnlp_dir, text_dir, nodes_csv_dir,
                relation_csv_dir, max_n=None):
    """
    Transform extracted variables to csv tables that can be imported
    by neo4j

    Parameters
    ----------
    vars_dir : str
        directory containing files with extracted variables in json format
    scnlp_dir : str
        directory containing files with scnlp output in xml format
    text_dir : str
        directory containing files with original text or sentences
        (one per line)
    nodes_csv_dir : str
        output directory for nodes csv files
    relation_csv_dir : str
        output directory for relationships csv files
    max_n: int or None
        process max_n variable files

    Notes
    -----
    See http://neo4j.com/docs/stable/import-tool-header-format.html
    """
    # TODO: change article nodes to document

    # hold on to open files so they can all be closed when done
    open_files = []

    def create_csv_file(dir, csv_fname,
                        header=(':START_ID', ':END_ID', ':TYPE')):
        # open a csv output file, register it for closing,
        # write its neo4j header row and return a writer for it
        csv_fname = Path(dir) / csv_fname
        log.info('creating ' + csv_fname)
        f = open(csv_fname, 'w', newline='')
        open_files.append(f)
        csv_file = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        csv_file.writerow(header)
        return csv_file

    Path(nodes_csv_dir).makedirs_p()
    Path(relation_csv_dir).makedirs_p()

    try:
        # create csv files for nodes
        articles_csv = create_csv_file(nodes_csv_dir,
                                       'articles.csv',
                                       ('doi:ID', 'filename', ':LABEL'))
        sentences_csv = create_csv_file(nodes_csv_dir,
                                       'sentences.csv',
                                       ('sentID:ID', 'treeNumber:int',
                                        'charOffsetBegin:int',
                                        'charOffsetEnd:int',
                                        'sentChars', ':LABEL'))
        variables_csv = create_csv_file(nodes_csv_dir,
                                        'variables.csv',
                                        ('subStr:ID', ':LABEL'))
        events_csv = create_csv_file(nodes_csv_dir,
                                     'events.csv',
                                     ('eventID:ID', 'filename',
                                      'nodeNumber:int', 'extractName',
                                      'charOffsetBegin:int',
                                      'charOffsetEnd:int', 'direction',
                                      ':LABEL'))

        # create csv files for relations
        has_sent_csv = create_csv_file(relation_csv_dir, 'has_sent.csv')
        has_var_csv = create_csv_file(relation_csv_dir, 'has_var.csv')
        has_event_csv = create_csv_file(relation_csv_dir, 'has_event.csv')
        tentails_var_csv = create_csv_file(relation_csv_dir,
                                           'tentails_var.csv',
                                           (':START_ID', ':END_ID',
                                            'transformName', ':TYPE'))

        # set of all variable types in text collection
        variable_types = set()

        # mapping from DOI to text files
        doi2txt = _doi2txt_fname(text_dir)

        for json_fname in Path(vars_dir).files('*.json')[:max_n]:
            # read records with a context manager so the handle is closed
            with open(json_fname) as json_file:
                records = json.load(json_file)

            if not records:
                log.warning('skipping empty variables file: ' + json_fname)
                continue

            log.info('processing variables from file: ' + json_fname)
            doi = get_doi(json_fname)

            try:
                text_fname = doi2txt[doi]
            except KeyError:
                log.error('no matching text file for DOI ' + doi)
                continue

            with open(text_fname) as text_file:
                text = text_file.read()

            # read corenlp analysis
            tree_fname = records[0]['filename']
            scnlp_fname = derive_path(tree_fname, new_dir=scnlp_dir,
                                      new_ext='xml')
            xml_tree = etree.parse(scnlp_fname)
            sentences_elem = xml_tree.getroot()[0][0]

            # create article node
            articles_csv.writerow((doi, text_fname, 'Article'))

            tree_number = None
            # mapping of record's "key" to "subStr" attribute,
            # needed for TENTAILS_VAR relation
            key2var = {}

            for rec in records:
                if rec['treeNumber'] != tree_number:
                    # moving to new tree
                    tree_number = rec['treeNumber']
                    sent_id = '{}/{}'.format(doi, tree_number)
                    # get char offsets for sentence (tree numbers start at 1)
                    sent_elem = sentences_elem[int(tree_number) - 1]
                    begin = int(sent_elem[0][0][2].text)
                    end = int(sent_elem[0][-1][3].text)
                    sent_chars = text[begin:end]
                    sentences_csv.writerow((sent_id, tree_number, begin,
                                            end, sent_chars, 'Sentence'))
                    has_sent_csv.writerow((doi, sent_id, 'HAS_SENT'))

                key2var[rec['key']] = var_type = rec['subStr']

                if var_type not in variable_types:
                    variables_csv.writerow((var_type, 'VariableType'))
                    variable_types.add(var_type)

                # FIXME weak method of detecting preprocessing
                if ('transformName' in rec and
                        not rec['transformName'].startswith('PreProc')):
                    # variable is transformed, but not by preprocessing,
                    # so it is tentailed
                    ancestor_var_type = key2var[rec['ancestor']]
                    tentails_var_csv.writerow((ancestor_var_type, var_type,
                                               rec['transformName'],
                                               'TENTAILS_VAR'))
                else:
                    # observed event
                    event_id = rec['key']
                    event_labels = ('EventInst;' +
                                    rec['label'].capitalize() + 'Inst')
                    events_csv.writerow((event_id, tree_fname,
                                         rec['nodeNumber'],
                                         rec['extractName'],
                                         rec['charOffsetBegin'],
                                         rec['charOffsetEnd'],
                                         rec['label'],
                                         event_labels))
                    has_event_csv.writerow((sent_id, event_id, 'HAS_EVENT'))
                    has_var_csv.writerow((event_id, var_type, 'HAS_VAR'))
    finally:
        # release opened files even if processing failed midway,
        # so partially written csv output is flushed and not leaked
        for f in open_files:
            f.close()