Exemplo n.º 1
0
def main(config):
    """Print the text items of every node below the configured content base node.

    ``config`` is a path; its ``.yaml`` sibling is read and must provide the
    fields ``content_index`` (passed to ``CherryTree``) and
    ``content_base_name`` (name of the node whose descendants are printed).

    The process exits early when ``load_config`` fails, when the YAML file
    does not exist, or when the base node cannot be found in the index.
    """
    try:
        load_config(config)
    except Exception as e:
        # Report why loading failed instead of exiting silently.
        print(e)
        exit()

    cpath = Path(config).with_suffix('.yaml')
    if not cpath.exists():
        print('no config file found')
        exit()
    # safe_load: yaml.load() without an explicit Loader is rejected by
    # current PyYAML releases and is unsafe on untrusted files.
    cf = yaml.safe_load(cpath.read_text())

    ct = CherryTree(cf['content_index'])
    cbn = cf['content_base_name']

    content_base_node = ct.find_node_by_name(cbn)
    if not content_base_node:
        # The original message referenced an undefined name here.
        print(f' {cbn} not in index')
        exit()

    for node in content_base_node.descendants:
        for item in node.text:
            print(item)
def main(content_index, config_path):
    """Combine the files linked below the content base node and print the result.

    ``content_index`` is passed to ``CherryTree``. The configuration returned
    by ``load_config(config_path)`` must contain ``content_base_name``,
    ``content_file_dir`` and ``input_args``; ``output_args`` is optional and
    defaults to an empty mapping.

    Only links whose file path contains ``content_file_dir`` are handed to
    ``combine_files``. The process exits early when the configuration cannot
    be loaded or the base node is not found.
    """
    try:
        config = load_config(config_path)
    except Exception as e:
        print(e)
        exit()

    ct = CherryTree(content_index)
    cbn = config['content_base_name']
    content_dir = config['content_file_dir']
    input_defaults = config['input_args']
    output_defaults = config.get('output_args', {})

    try:
        content_base_node = ct.find_node_by_name(cbn)
    except KeyError:
        content_base_node = None
    if not content_base_node:
        # Stop here: continuing would fail on the missing node below.
        print(f'{cbn} not in index')
        exit()

    input_files = [
        str(link.filepath)
        for node in content_base_node.descendants
        for link in node.links
        if content_dir in str(link.filepath)
    ]

    print(combine_files(input_defaults, *input_files, **output_defaults))
Exemplo n.º 3
0
 def __init__(self, index_file=None, base_node=None, text_dir=None):
     """Store the index settings and open the CherryTree index.

     Prints the index file, keeps the three arguments as attributes of the
     same name, and builds ``CherryTree(index_file)`` as ``self.ct``. An
     exception raised while building it is printed and then re-raised.
     """
     print('index', index_file)
     self.index_file, self.base_node, self.text_dir = (
         index_file, base_node, text_dir)
     try:
         tree = CherryTree(index_file)
     except Exception as err:
         print(err)
         raise
     self.ct = tree
Exemplo n.º 4
0
def extract_named_entities(text_index, target_node):
    """Yield ``(node, entity)`` pairs for the documents below a node of the index.

    ``text_index`` is passed to ``CherryTree``. When ``target_node`` is
    truthy the search starts at ``ct.find(target_node)``, otherwise at
    ``ct.root``. Each subnode's ``textfile`` is read with ``read_document``
    and its ``content`` is run through ``nlp.pipe``; every entity in the
    resulting ``ents`` is yielded together with the node it came from.
    """
    ct = CherryTree(text_index)
    node = ct.find(target_node) if target_node else ct.root

    documents = [(read_document(n.textfile), n) for n in node.subnodes()]

    # as_tuples=True: pass (text, context) pairs and get (doc, context) back,
    # so each parsed doc stays associated with its source node.
    docs = nlp.pipe(
        ((document.content, source) for document, source in documents),
        as_tuples=True)

    for doc, source in docs:
        for ent in doc.ents:
            yield (source, ent)
Exemplo n.º 5
0
def main(config, output='csv'):
    """Match files in target directory with nodes containing file link.

    ``config`` is a path; its ``.yaml`` sibling must provide the fields
    ``content_index``, ``content_base_name`` and ``content_file_dir``.

    The stems of the ``.md`` files in ``content_file_dir`` are compared with
    the stems of the file links found below the base node (only links whose
    path contains ``content_file_dir``). Names present on one side only are
    printed as a table with the columns ``File`` and ``Link``.

    output: ``'csv'`` or ``'html'``; any other value prints an
    "unknown format" message.
    """
    try:
        load_config(config)
    except Exception as e:
        # Report why loading failed instead of exiting silently.
        print(e)
        exit()

    cpath = Path(config).with_suffix('.yaml')
    if not cpath.exists():
        print('no config file found')
        exit()
    # safe_load: yaml.load() without an explicit Loader is rejected by
    # current PyYAML releases and is unsafe on untrusted files.
    cf = yaml.safe_load(cpath.read_text())

    ct = CherryTree(cf['content_index'])

    cbn = cf['content_base_name']
    cfd = cf['content_file_dir']

    content_base_node = ct.find_node_by_name(cbn)
    if not content_base_node:
        # The original message referenced an undefined name here.
        print(f' {cbn} not in index')
        exit()

    file_links = {
        link.filepath.stem
        for node in content_base_node.descendants
        for link in node.links
        if link.filepath and cfd in str(link.filepath)
    }

    file_paths = {f.stem for f in Path(cfd).iterdir() if f.suffix == '.md'}

    # Files without a link first, then links without a file.
    unmatched = [(name, None) for name in file_paths - file_links]
    unmatched.extend((None, name) for name in file_links - file_paths)

    df = pd.DataFrame(unmatched, columns=['File', 'Link'])
    df = df.drop_duplicates().sort_values(['Link', 'File'])

    if output == 'csv':
        print(df.to_csv(sep=' ', index=False, header=False))
    elif output == 'html':
        print(df.to_html())
    else:
        print(f'unknown format {output}')
Exemplo n.º 6
0
def main(config, category, output='html'):
    """Match anchors to targets in content branch of index.

    ``config`` is a path; its ``.yaml`` sibling must provide the fields
    ``content_index``, ``content_base_name``, ``categories`` and
    ``report_folder``.

    Every anchor found below one of the category nodes that is not the
    target of a link below the content base node is reported. One HTML file
    named ``<snake_case(category)>_unlinked_anchors.html`` is written per
    category into ``report_folder`` via ``convert_text``.

    NOTE(review): the ``category`` and ``output`` parameters are not used by
    the body; they are kept so existing callers continue to work.

    Raises:
        ValueError: if the base node or one of the category nodes is missing.
    """
    def category_nodes(names):
        """Yield the index node for each name; fail on the first missing one."""
        for name in names:
            node = ct.find_node_by_name(name)
            if not node:
                # Raising a plain string is itself a TypeError.
                raise ValueError(f'category {name} not found')
            yield node

    cpath = Path(config).with_suffix('.yaml')
    if not cpath.exists():
        print('no config file found')
        exit()
    # safe_load: yaml.load() without an explicit Loader is rejected by
    # current PyYAML releases and is unsafe on untrusted files.
    cf = yaml.safe_load(cpath.read_text())

    # All settings come from the parsed file `cf`; `config` is only its path.
    ct = CherryTree(cf['content_index'])

    base_name = cf['content_base_name']
    content_base_node = ct.find_node_by_name(base_name)
    if not content_base_node:
        raise ValueError(f'base node {base_name} not found')

    # Set for cheap membership tests; assumes anchor names are hashable
    # (presumably strings) - TODO confirm.
    anchor_links = {
        link.node_anchor
        for node in content_base_node.descendants
        for link in node.links
        if link.node_anchor
    }

    unlinked_anchors = [
        (cat_node.name, node.name, anchor.name)
        for cat_node in category_nodes(cf['categories'])
        for node in cat_node.descendants
        for anchor in node.anchors
        if anchor.name not in anchor_links
    ]

    df = pd.DataFrame(unlinked_anchors, columns=['Category', 'Node', 'Anchor'])

    report_path = Path(cf['report_folder'])

    for category_name in df.Category.unique():
        target = report_path.joinpath(
            f'{snake_case(category_name)}_unlinked_anchors').with_suffix('.html')
        convert_text(
            df[df.Category == category_name].sort_values('Node').to_html(),
            target)
    print('finished')
Exemplo n.º 7
0
def main(config_path):
    """Convert every file linked below the configured target node.

    ``config_path`` is a YAML file that must provide ``source_index``
    (passed to ``CherryTree``), ``target_name`` (name of the node whose
    descendants are scanned) and ``output_args`` (keyword arguments forwarded
    to ``convert_file``).

    The process exits early when the file does not exist or the target node
    is not found.
    """
    cpath = Path(config_path)
    if not cpath.exists():
        print('no config found')
        exit()
    # safe_load: yaml.load() without an explicit Loader is rejected by
    # current PyYAML releases and is unsafe on untrusted files.
    config = yaml.safe_load(cpath.read_text())

    ct = CherryTree(config['source_index'])

    target_name = config['target_name']
    link_target_node = ct.find_node_by_name(target_name)
    if not link_target_node:
        print(f'target node {target_name} not found')
        exit()

    output_args = config['output_args']
    for node in link_target_node.descendants:
        for link in node.links:
            if link.filepath:
                convert_file(str(link.filepath), **output_args)

    print('finished')
Exemplo n.º 8
0
class DocumentIndex():
    """Keep a CherryTree index whose nodes link to Markdown files on disk."""

    def __init__(self, index_file=None, base_node=None, text_dir=None):
        """Store the settings and open ``index_file`` with ``CherryTree``.

        An exception raised while opening the index is printed and re-raised.
        """
        print('index', index_file)
        self.index_file = index_file
        self.base_node = base_node
        self.text_dir = text_dir
        try:
            self.ct = CherryTree(index_file)
        except Exception as e:
            print(e)
            raise

    def documents(self, base=None):
        """Yield a ``Document`` for every node that has a loadable file link.

        Nodes without a link of type ``'file'`` and links whose document
        loads as falsy are skipped.
        """
        for node in self.ct.nodes(base):
            filelink = next((l for l in node.links if l.type == 'file'), None)
            if not filelink:
                continue
            # NOTE(review): load_document is not defined in the visible part
            # of this class - confirm it is provided elsewhere.
            document = self.load_document(filelink.href)
            if not document:
                continue
            yield Document(node, document, filelink)

    def save_index(self, added):
        """Save the index if ``added`` contains at least one stored file.

        ``added`` holds ``add_document`` results: a path for a newly indexed
        file, ``True`` for an already indexed one, ``False`` for a rejected
        one. Only real paths count; booleans and ``None`` are ignored so
        that failures are not reported as stored.

        Returns ``'No Documents Indexed'`` when nothing was added, otherwise
        ``True`` after saving.
        """
        indexed_files = [
            f for f in added
            if f is not None and not isinstance(f, bool)
        ]
        if not indexed_files:
            return 'No Documents Indexed'
        self.ct.save()
        for filename in indexed_files:
            print(f'stored {str(filename)} to {self.index_file}')
        return True

    def add_document(self, fp):
        """Insert a node for the Markdown file ``fp`` below the base node.

        Returns ``False`` when ``fp`` is not an existing ``.md`` file or
        cannot be read, ``True`` when its identifier is already in the
        index, and the path itself once a new node has been inserted. The
        index is not saved here.
        """
        # Accept plain strings as well as Path objects.
        fp = Path(fp)
        if not fp.exists() or not fp.is_file() or fp.suffix != '.md':
            return False

        document = read_file(str(fp))
        if not document:
            return False
        print(document.title, document.identifier)

        if self.ct.find_node_by_text(document.identifier):
            print(fp, 'already indexed')
            return True

        # Create the base node on first use.
        base_node = self.ct.find_node_by_name(self.base_node) \
            or self.ct.insert_node(self.base_node)
        # Fall back to a title derived from the file name.
        title = document.title or re.sub(r'[-_]', ' ', fp.stem).title()
        node = self.ct.insert_node(title, parent=base_node)
        anchor = node.insert_anchor(name=document.identifier)
        node.insert_link(href=str(fp), text="File", sibling=anchor)
        node.insert_text(f'\n{"~" * 30}\n')
        return fp

    def add_from_filelist(self, filelist):
        """Index every path listed, one per line, in the file ``filelist``."""
        lines = Path(filelist).read_text().splitlines()
        # Blank lines would otherwise turn into Path('.').
        added = [
            self.add_document(Path(line.strip()))
            for line in lines if line.strip()
        ]
        self.save_index(added)

    def add_from_filename(self, filepath):
        """Index a single file and save the index; always returns ``True``."""
        rs = self.add_document(filepath)
        if rs:
            self.save_index([rs])
        return True

    def add_from_stream(self):
        """Index every path read from standard input, one per line."""
        added = []
        for filepath in sys.stdin.readlines():
            fp = Path(filepath.strip())
            added.append(self.add_document(fp))
        self.save_index(added)
        return True

    def add_file_link(self):
        """Interactively link an existing document to an existing node.

        Prompts for the node name and the file path (defaulting to a name
        derived from the node inside ``text_dir``), inserts an anchor with
        the document identifier plus a "Content" link, and saves the index.
        Returns a message describing the outcome.
        """
        node_name = input('Node Name: ')
        node = self.ct.find_node_by_name(node_name)

        if not node:
            return f'Cannot find node {node_name}'

        fp = Path(self.text_dir,
                  node.name.replace(' ', "-").lower()).with_suffix('.md')

        filepath = input(f'Filepath: {str(fp)}') or str(fp)
        fp = Path(filepath)
        if not fp.exists():
            return (f'{str(filepath)} does not exist')
        document = read_file(filepath)
        identifier = document.identifier
        if not identifier:
            return f'document {filepath} has no identifier'
        node.insert_anchor(identifier)
        node.insert_link(href=filepath, text="Content")
        self.ct.save()
        return f'Link to {filepath} added to {node.name}'

    def export_to_file(self):
        """Interactively write a node's texts to a Markdown file and link it.

        Prompts for the node name and the target directory, converts the
        node's non-empty texts with pandoc into
        ``<target dir>/<node name>.md``, removes the node's ``rich_text``
        children, inserts an anchor and a "Content" link, and saves the
        index. Returns a message describing the outcome, or the exception
        raised by the conversion.
        """
        node_name = input('Node Name: ')
        node = self.ct.find_node_by_name(node_name)

        if not node:
            return f'Cannot find node {node_name}'

        target_dir = input('Target Directory: ')
        td = Path(target_dir)
        if not td.exists():
            return (f'{target_dir} does not exist')
        fp = Path(self.text_dir, snake_case(node.name)).with_suffix('.md')

        # NOTE(review): the answer to this prompt is not used; the output
        # path is built from the target directory below. The prompt is kept
        # so the interactive sequence stays unchanged - confirm intent.
        input(f'Filepath: {str(fp)}')

        identifier = uuid4().hex[:8]
        outputfile = td.joinpath(node_name.replace(
            ' ', '_').lower()).with_suffix('.md')
        if outputfile.exists():
            print(f'{str(outputfile)} already exists')
        outputfile = str(outputfile)
        content = "\n".join([t for t in node.texts if len(t) > 0])
        try:
            pypandoc.convert_text(content,
                                  'markdown',
                                  format='markdown',
                                  outputfile=outputfile,
                                  extra_args=[
                                      f'--metadata=identifier:{identifier}',
                                      f'--metadata=title:{node.name}',
                                      '--defaults=create_document'
                                  ])
        except Exception as e:
            return e
        # Snapshot the children first: removing elements while iterating
        # over them can skip siblings.
        for element in list(node.element.iterchildren('rich_text')):
            element.getparent().remove(element)
        node.insert_anchor(identifier)
        node.insert_link(href=outputfile, text="Content")
        self.ct.save()
        return f'Notes from {node.name} written to {outputfile}'