Example #1
def get_expanded_family_dict(sim_metric="path",
                             synset_names=False,
                             unique=True):
    from brainbehavior.nlp import do_stem
    families = get_families(sim_metric=sim_metric, synset_names=synset_names)
    if unique:
        # An identical stem means we would need to tell the words apart based
        # on context (e.g., sensitive as a noun vs. an adjective). Since this
        # method cannot do that (the stems are both "sensit"), we simply merge
        # the families.
        family_index = get_family_index(families)
        # Combine families, and save a list of all stems (including family)
        combined_families = dict()
        for stem, indices in family_index.items():
            direction = []
            family = []
            similarity = []
            for idx in indices:
                direction += families[idx]["direction"]
                family += do_stem(families[idx]["family"])
                similarity += families[idx]["similarity"]
            combined_families[stem] = {
                "base": stem,
                "family": family,
                "direction": direction,
                "similarity": similarity
            }
        return combined_families
    else:
        return families
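
The merge above exists because distinct words can collapse to the same stem once stemmed. A minimal sketch of that collision, assuming do_stem wraps a Porter-style stemmer (NLTK's PorterStemmer stands in here purely for illustration; the real dependency is brainbehavior.nlp.do_stem):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("sensitive"))    # "sensit"
print(stemmer.stem("sensitivity"))  # "sensit" -- same stem, so the two
                                    # families get merged under one key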
Example #2
def get_expanded_behavior_list(sim_metric="path", synset_names=False):
    import numpy
    from brainbehavior.nlp import do_stem
    families = get_families(sim_metric=sim_metric, synset_names=synset_names)
    family_index = get_family_index(families)
    allstems = []
    for stem, indices in family_index.items():
        allstems.append(stem)
        for idx in indices:
            allstems += do_stem(families[idx]["family"])
    return numpy.unique(allstems).tolist()
Example #3
def get_family_index(families):
    import numpy
    from brainbehavior.nlp import do_stem

    def base_stem(family):
        # A base is either a plain string or a WordNet synset; synsets are
        # reduced to the word before the first "." in their name.
        if isinstance(family["base"], str):
            return do_stem(family["base"])[0]
        return do_stem([family["base"].name().split(".")[0]])[0]

    stems = numpy.unique([base_stem(f) for f in families]).tolist()
    family_index = dict((s, []) for s in stems)
    for f in range(len(families)):
        family_index[base_stem(families[f])].append(f)
    return family_index
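
A hedged usage sketch of the index this function builds. The families below are hypothetical minimal stand-ins for what get_families returns, with plain-string bases; the stems follow the Porter-style behavior the comment in Example #1 attributes to do_stem:

families = [
    {"base": "sensitive"},    # stems to "sensit"
    {"base": "sensitivity"},  # also stems to "sensit"
    {"base": "memory"},       # stems to "memori"
]
index = get_family_index(families)
# index == {"memori": [2], "sensit": [0, 1]}
# Each stem maps to the positions of every family whose base collapses to
# that stem, which is exactly what the merge in Example #1 consumes.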
Example #4
def setUp(self):
    term_pickle = "../brainbehavior/data/cognitiveatlas/behavioraltraits.pkl"
    self.terms = pickle.load(open(term_pickle, "rb"))
    self.stems = do_stem(self.terms)
Example #5
import os
import sys
import pickle
from glob import glob

import pandas

# get_xml_tree, get_term_counts, and get_total_words come from the
# repository's analysis helpers (module not shown in this excerpt)
from brainbehavior.nlp import do_stem

# Command-line arguments: the top folder and subfolder that hold the
# xml archives, the terms pickle, and the output folder
topfolder = sys.argv[1]
subfolder = sys.argv[2]
term_pickle = sys.argv[3]
outfolder = sys.argv[4]

folder = "%s/%s" % (topfolder, subfolder)

# Get compressed files in folder.
zips = glob("%s/*.tar.gz" % folder)

# Read in our terms
terms = pickle.load(open(term_pickle, "rb"))
stems = do_stem(terms)

# We will save counts and total words
dfcount = pandas.DataFrame(columns=stems)
totalwords = []

# We want to keep a count of those with no terms
noterms = 0

for z in zips:
    zname = "%s/%s" % (subfolder, os.path.basename(z))
    text = get_xml_tree(z)
    counts = get_term_counts(terms, text)
    # Only save if we have at least one!
    if counts["count"].sum() > 0:
        totalwords.append(get_total_words(text))
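
get_term_counts is not shown in this excerpt, but the loop above only relies on it returning a frame with a "count" column, one row per term. A minimal, hypothetical counter in the same spirit (the name and behavior are assumptions, not the project's API):

import re
import pandas

def count_terms_sketch(terms, text):
    # Case-insensitive whole-word counts for each term, returned as a
    # one-column frame indexed by term, so counts["count"].sum() works
    # the same way as in the loop above.
    lowered = text.lower()
    counts = [len(re.findall(r"\b%s\b" % re.escape(t.lower()), lowered))
              for t in terms]
    return pandas.DataFrame({"count": counts}, index=terms)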
Example #6
                                    if len(current.body) > 0:
                                        content.append(current.body)
                                        ids.append(current.fullname)
                                        scores.append(current.score)
        except Exception:
            print("Skipping %s" % sub.fullname)

    print("%s has %s entities" % (disorder, len(content)))
    result = {"content": content, "disorder": disorder, "score": scores,
              "uids": ids, "retrieved": now}
    pickle.dump(result, open("analysis/reddit/%s_dict.pkl" % disorder, "wb"))


### 2. COUNT TERMS ##################################################################
term_pickle = "brainbehavior/data/cognitiveatlas/behavioraltraits.pkl"
terms = pickle.load(open(term_pickle, "rb"))
stems = do_stem(terms)

pickles = glob("%s/*_dict.pkl" % outfolder)
for result_file in pickles:
    result = pickle.load(open(result_file, "rb"))
    content = result["content"]
    print("Parsing %s" % result_file)
    # We will save counts and total words
    dfcount = pandas.DataFrame(columns=stems)
    totalwords = []
    # We want to keep a count of those with no terms
    noterms = 0
    for t in range(len(content)):
        print("%s of %s" % (t, len(content)))
        # Replace non-ASCII characters with spaces before counting
        text = ''.join([i if ord(i) < 128 else ' ' for i in content[t]])
        counts = get_term_counts(terms, str(text))
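
The join in the loop above scrubs non-ASCII characters: anything with a code point of 128 or higher becomes a space before counting. A quick illustration of that behavior:

sample = u"d\u00e9j\u00e0 vu memory"   # "déjà vu memory"
print(''.join(c if ord(c) < 128 else ' ' for c in sample))
# -> "d j  vu memory"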