示例#1
0
def get_meta(base_dir):
    '''get_meta maps each corpus folder name to its meta files

    Parameters
    ==========
    base_dir: path
        directory containing a "corpus" folder; every directory returned
        by find_directories("<base_dir>/corpus") is searched

    Returns
    =======
    dict of {corpus folder basename: list of paths matching "*meta*"}.
    Folders without any match are left out.
    '''
    # Pair every corpus folder name with whatever the glob pattern finds
    candidates = (
        (os.path.basename(corpus_path), glob("%s/*meta*" % corpus_path))
        for corpus_path in find_directories("%s/corpus" % base_dir)
    )
    # Keep only folders that actually contain meta files
    return {name: matches for name, matches in candidates if matches}
示例#2
0
def get_corpus(analysis_dir):
    '''get_corpus maps each corpus folder name to its sentence files

    Parameters
    ==========
    analysis_dir: path
        directory containing a "corpus" folder; every directory returned
        by find_directories("<analysis_dir>/corpus") is searched

    Returns
    =======
    dict of {corpus folder basename: list of paths matching "*sentences*"}.
    Folders without any match are left out.
    '''
    sentence_files = dict()
    search_root = "%s/corpus" % analysis_dir
    for corpus_path in find_directories(search_root):
        label = os.path.basename(corpus_path)
        matches = glob("%s/*sentences*" % corpus_path)
        # Skip corpus folders that hold no sentence files
        if not matches:
            continue
        sentence_files[label] = matches
    return sentence_files
示例#3
0
def get_relations(base_dir,tags=None,read=False):
    '''get_relations finds the relations files stored for one or more tags

    Parameters
    ==========
    base_dir: path
        directory containing a "relations" folder with one sub folder per tag
    tags: str or list of str
        tag(s) to look up. A single string is treated as a one-item list.
        If None, every directory returned by find_directories for the
        relations folder is used as a tag.
    read: boolean
        if True, the file list of each tag is handed to read_relations and
        its return value is stored. If False, the list of file paths is stored.

    Returns
    =======
    dict of {tag: relations}. Tags without any "*_relations.json" file
    are left out.
    '''
    edges = dict()
    if isinstance(tags,str):
        tags = [tags]
    relations_dir = "%s/relations" %(os.path.abspath(base_dir))
    # Identity check: None is a singleton and should not go through __eq__
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]
    for tag in tags:
        print("Finding relations for %s" %(tag))
        relations_files = glob("%s/%s/*_relations.json" %(relations_dir,tag))
        # Tags with no relations files do not get an entry at all
        if not relations_files:
            continue
        if read:
            edges[tag] = read_relations(relations_files)
        else:
            edges[tag] = relations_files
    return edges
示例#4
0
def get_relations(base_dir, tags=None, read=False):
    '''get_relations collects the relations files available for each tag

    Parameters
    ==========
    base_dir: path
        directory containing a "relations" folder with one sub folder per tag
    tags: str or list of str
        tag(s) to look up. A single string is wrapped into a list. If None,
        the basename of every directory returned by find_directories for
        the relations folder is used.
    read: boolean
        if True, store the return value of read_relations(files) for each
        tag; if False, store the list of file paths itself.

    Returns
    =======
    dict of {tag: relations}; a tag with no "*_relations.json" file gets
    no entry.
    '''
    if isinstance(tags, str):
        tags = [tags]
    relations_dir = "%s/relations" % (os.path.abspath(base_dir))
    # Compare against None by identity rather than equality
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    edges = dict()
    for tag in tags:
        print("Finding relations for %s" % (tag))
        relations_files = glob("%s/%s/*_relations.json" % (relations_dir, tag))
        # An empty glob result means there is nothing to report for this tag
        if relations_files:
            edges[tag] = read_relations(relations_files) if read else relations_files
    return edges
示例#5
0
def get_relations_df(base_dir,tags=None):
    '''get_relations_df builds one symmetric term-by-term table per tag

    Relations files are expected to be named
    "<term1>_<term2>_relations.json"; the two terms are taken from the
    first two "_"-separated parts of the file name.

    Parameters
    ==========
    base_dir: path
        directory containing a "relations" folder with one sub folder per tag
    tags: str or list of str
        tag(s) to parse. A single string is treated as a one-item list.
        If None, every directory returned by find_directories for the
        relations folder is used as a tag.

    Returns
    =======
    dict of {tag: pandas.DataFrame}. Each data frame is indexed by term on
    both axes and holds the "value" entry of every relations file at
    [term1,term2] and [term2,term1]. A tag without relations files gets an
    empty data frame.
    '''
    if isinstance(tags,str):
        tags = [tags]
    relations_dir = "%s/relations" %(os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]
    # Must exist before the loop, also when there are no tags at all
    relations = dict()
    for tag in tags:
        print("Finding relations for %s" %(tag))
        relations_files = glob("%s/%s/*_relations.json" %(relations_dir,tag))
        # Terms come from the file name only (never the folder part of the
        # path), and both terms of a pair must be present as labels
        term_pairs = [os.path.basename(x).split("_")[0:2] for x in relations_files]
        term_names = sorted({term for pair in term_pairs for term in pair})
        edges = pandas.DataFrame(columns=term_names,index=term_names)
        for r,(relation_file,(term1,term2)) in enumerate(zip(relations_files,term_pairs)):
            print("Parsing %s of %s" %(r,len(relations_files)))
            # Read each file once and mirror the value across the diagonal
            value = read_json(relation_file)["value"]
            edges.loc[term1,term2] = value
            edges.loc[term2,term1] = value
        relations[tag] = edges
    return relations
示例#6
0
def get_relations_df(base_dir,tags=None):
    '''get_relations_df parses relations files into one data frame per tag

    File names are expected to follow "<term1>_<term2>_relations.json";
    term1 and term2 are the first two "_"-separated parts of the file name.

    Parameters
    ==========
    base_dir: path
        directory containing a "relations" folder with one sub folder per tag
    tags: str or list of str
        tag(s) to parse. A single string is wrapped into a list. If None,
        the basename of every directory returned by find_directories for
        the relations folder is used.

    Returns
    =======
    dict of {tag: pandas.DataFrame} where rows and columns are the sorted
    term names and cell [term1,term2] (and its mirror [term2,term1]) holds
    the "value" entry of the matching relations file. Tags without files
    map to an empty data frame.
    '''
    if isinstance(tags,str):
        tags = [tags]
    relations_dir = "%s/relations" %(os.path.abspath(base_dir))
    if tags is None:
        tags = [os.path.basename(x) for x in find_directories(relations_dir)]

    # Result container; without it the function cannot return anything
    relations = dict()
    for tag in tags:
        print("Finding relations for %s" %(tag))
        relations_files = glob("%s/%s/*_relations.json" %(relations_dir,tag))

        # Work out (term1,term2) once per file, from the file name alone
        parsed = []
        for relation_file in relations_files:
            term1,term2 = os.path.basename(relation_file).split("_")[0:2]
            parsed.append((relation_file,term1,term2))

        # Labels must cover both sides of every pair; sorted for a stable order
        term_names = sorted(set([p[1] for p in parsed] + [p[2] for p in parsed]))
        edges = pandas.DataFrame(columns=term_names,index=term_names)

        for r,(relation_file,term1,term2) in enumerate(parsed):
            print("Parsing %s of %s" %(r,len(relations_files)))
            # Single read per file; the relation is stored symmetrically
            value = read_json(relation_file)["value"]
            edges.loc[term1,term2] = value
            edges.loc[term2,term1] = value
        relations[tag] = edges
    return relations
示例#7
0
def get_plugins(plugin_repo=None,load=False):
    '''get_plugins returns the valid plugins found in a wordfish-plugins folder

    If no folder is given, custom_app_download(repo_types=["plugins"]) is
    called and the "plugins" folder inside the directory it returns is used.

    Parameters
    ==========
    plugin_repo: path to plugins repo
        if None, the repo is obtained through custom_app_download
    load: boolean
        if True, the valid plugin folders are passed to load_plugins and
        its return value is returned. If False, the list of valid plugin
        folder paths is returned.

    Returns
    =======
    list of plugin folders (as returned by find_directories) for which
    validate() is truthy, or the result of load_plugins on that list
    '''
    # Identity check: None is a singleton
    if plugin_repo is None:
        tmpdir = custom_app_download(repo_types=["plugins"])
        plugin_repo = "%s/plugins" %(tmpdir)

    plugins = find_directories(plugin_repo)
    valid_plugins = [p for p in plugins if validate(p)]
    print("Found %s valid plugins" %(len(valid_plugins)))
    # Any truthy flag requests loading, not only the literal True
    if load:
        valid_plugins = load_plugins(valid_plugins)
    return valid_plugins
示例#8
0
def get_plugins(plugin_repo=None, load=False):
    '''get_plugins lists (and optionally loads) the valid plugins of a
    wordfish-plugins folder

    When plugin_repo is not provided, the repo is fetched with
    custom_app_download(repo_types=["plugins"]) and its "plugins" sub
    folder is searched instead.

    Parameters
    ==========
    plugin_repo: path to plugins repo
        if None, the repo is obtained through custom_app_download
    load: boolean
        if True, returns whatever load_plugins produces for the valid
        plugin folders. If False, returns the valid plugin folder paths.

    Returns
    =======
    the folders from find_directories(plugin_repo) that pass validate(),
    either as a list of paths or as the result of load_plugins
    '''
    # None is compared by identity, not equality
    if plugin_repo is None:
        tmpdir = custom_app_download(repo_types=["plugins"])
        plugin_repo = "%s/plugins" % (tmpdir)

    valid_plugins = [p for p in find_directories(plugin_repo) if validate(p)]
    print("Found %s valid plugins" % (len(valid_plugins)))
    # Plain truthiness instead of comparing the flag against True
    if not load:
        return valid_plugins
    return load_plugins(valid_plugins)
示例#9
0
def get_terms(analysis_dir,subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a single data structure
    This (maybe) won't work for larger datasets (we will use a database) but it will for testing.

        nodes:

            {"[plugin]::[uid]":[node]}

        edges:

            {"[plugin]::[source]<>[plugin]::[target]":
                {"source":..., "target":..., "value":...}}

    Each folder returned by find_directories("<analysis_dir>/terms") is
    treated as one plugin. Its "terms.json" ("nodes" list) and
    "term_relationships.json" ("edges" list) are read when present.
    If the terms folder exists, the result is also written with
    save_pretty_json to "<analysis_dir>/terms/terms.json".

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory. If None, or if it has no "terms"
        folder, an empty result is returned and nothing is written.
    subset: boolean
        if True, returns terms in dictionary based on source tag:
        {plugin: {"nodes":..., "edges":...}}. If False, all plugins are
        merged into {"all": {"nodes":..., "edges":...}}. Default==True

    Returns
    =======
    dict, shaped as described for subset
    '''

    # Defined up front so every code path below can build a result
    nodes = dict()
    edges = dict()
    results = dict()

    terms_dir = None
    if analysis_dir is not None:
        terms_dir = "%s/terms" %(os.path.abspath(analysis_dir))
    has_terms = terms_dir is not None and os.path.exists(terms_dir)

    if has_terms:
        for term_plugin in find_directories(terms_dir):
            plugin_name = os.path.basename(term_plugin)

            # Per-plugin output starts from empty containers; the merged
            # output keeps accumulating into the same two dictionaries
            if subset:
                nodes = dict()
                edges = dict()

            # Here we parse together terms
            terms_file = "%s/terms.json" %term_plugin
            if os.path.exists(terms_file):
                for node in read_json(terms_file)["nodes"]:
                    if "uid" in node:
                        uid = "%s::%s" %(plugin_name,node["uid"])
                    else:
                        # No uid given: fall back to the name, spaces replaced
                        feature_name = node["name"].replace(" ","_")
                        uid = "%s::%s" %(plugin_name,feature_name)
                    nodes[uid] = node

            # Here we parse together relationships
            # Currently only supported for terms within the same family
            relationships_file = "%s/term_relationships.json" %term_plugin
            if os.path.exists(relationships_file):
                for relation in read_json(relationships_file)["edges"]:
                    uid_1 = "%s::%s" %(plugin_name,relation["source"])
                    uid_2 = "%s::%s" %(plugin_name,relation["target"])
                    relation_uid = "%s<>%s" %(uid_1,uid_2)
                    edges[relation_uid] = {"source": uid_1,
                                           "target": uid_2,
                                           "value": relation["value"]}

            if subset:
                results[plugin_name] = {"nodes":nodes,"edges":edges}

    if subset:
        result = results
    else:
        result = {"all":{"nodes":nodes,"edges":edges}}

    # Only write next to the parsed terms; without a terms folder there is
    # no place to save to
    if has_terms:
        save_pretty_json(result,"%s/terms/terms.json" %(analysis_dir))
    return result
示例#10
0
def get_terms(analysis_dir, subset=True):
    '''
    For all terms defined, and relationships for the terms, parse into a single data structure
    This (maybe) won't work for larger datasets (we will use a database) but it will for testing.

        nodes:

            {"[plugin]::[uid]":[node]}

        edges:

            {"[plugin]::[source]<>[plugin]::[target]":
                {"source":..., "target":..., "value":...}}

    Every folder returned by find_directories("<analysis_dir>/terms")
    counts as one plugin; "terms.json" (key "nodes") and
    "term_relationships.json" (key "edges") are read from it when they
    exist. When the terms folder exists the result is additionally handed
    to save_pretty_json for "<analysis_dir>/terms/terms.json".

    Parameters
    ==========
    analysis_dir: path
        full path to analysis directory. None, or a directory without a
        "terms" folder, gives an empty result and writes nothing.
    subset: boolean
        if True, returns terms in dictionary based on source tag:
        {plugin: {"nodes":..., "edges":...}}. If False, everything is
        merged under {"all": {"nodes":..., "edges":...}}. Default==True

    Returns
    =======
    dict, shaped as described for subset
    '''

    # Containers exist before any branching so no path leaves them unbound
    nodes = dict()
    edges = dict()
    results = dict()

    if analysis_dir is None:
        terms_dir = None
    else:
        terms_dir = "%s/terms" % (os.path.abspath(analysis_dir))
    has_terms = terms_dir is not None and os.path.exists(terms_dir)

    term_plugins = find_directories(terms_dir) if has_terms else []
    for term_plugin in term_plugins:
        plugin_name = os.path.basename(term_plugin)

        # Fresh containers per plugin when splitting by source; otherwise
        # all plugins share (and keep filling) the same two dictionaries
        if subset:
            nodes = dict()
            edges = dict()

        # Here we parse together terms
        terms_file = "%s/terms.json" % term_plugin
        if os.path.exists(terms_file):
            for node in read_json(terms_file)["nodes"]:
                if "uid" in node:
                    uid = "%s::%s" % (plugin_name, node["uid"])
                else:
                    # Nodes without a uid are keyed by their name instead
                    feature_name = node["name"].replace(" ", "_")
                    uid = "%s::%s" % (plugin_name, feature_name)
                nodes[uid] = node

        # Here we parse together relationships
        # Currently only supported for terms within the same family
        relationships_file = "%s/term_relationships.json" % term_plugin
        if os.path.exists(relationships_file):
            for relation in read_json(relationships_file)["edges"]:
                uid_1 = "%s::%s" % (plugin_name, relation["source"])
                uid_2 = "%s::%s" % (plugin_name, relation["target"])
                relation_uid = "%s<>%s" % (uid_1, uid_2)
                edges[relation_uid] = {
                    "source": uid_1,
                    "target": uid_2,
                    "value": relation["value"]
                }

        if subset:
            results[plugin_name] = {"nodes": nodes, "edges": edges}

    result = results if subset else {"all": {"nodes": nodes, "edges": edges}}

    # Saving only makes sense when the terms folder is actually there
    if has_terms:
        save_pretty_json(result, "%s/terms/terms.json" % (analysis_dir))
    return result