def get_region_owner(wordlist, region_indicator):
    """
    Given a list of tokens (as Quepy Words), return a SAST (sub-)tree
    representing the containment relations (and any connectives) between
    the regions in the list (as identified by region_indicator).
    """
    #print "get_region_owner wordlist", wordlist
    #print "get_region_owner region_indicator", region_indicator
    # NOTE: This treats ands/ors as binary operators.
    tokenlist = [x.token.lower() for x in wordlist]
    if 'and' in tokenlist:
        andidx = tokenlist.index('and')
        # NOTE: We currently treat 'and's as disjunctions
        return IsOrOp() \
            + HasPart(get_region_owner(wordlist[0:andidx], region_indicator)) \
            + HasPart(get_region_owner(wordlist[andidx + 1:], region_indicator))
    elif 'or' in tokenlist:
        oridx = tokenlist.index('or')
        return IsOrOp() \
            + HasPart(get_region_owner(wordlist[0:oridx], region_indicator)) \
            + HasPart(get_region_owner(wordlist[oridx + 1:], region_indicator))
    else:
        if tokenlist[0] == 'in':
            wordlist = wordlist[1:]
        previous_region = None
        for n in finditer(Question(Lemma('not')) + region_indicator, wordlist):
            r, s = n.span()
            region_name = ' '.join([comp.token for comp in wordlist[r:s]]).lower()
            current_region = get_name_expression(region_name)
            if previous_region:
                current_region = current_region + HasSubregion(previous_region)
            previous_region = current_region
        return previous_region
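The pattern Question(Lemma('not')) + region_indicator makes an optional negation token part of each region hit. Below is a minimal, self-contained sketch of refo's Question operator using plain strings and Literal in place of the quepy Words and Lemma predicate (the token list and region names are made up for illustration):

from refo import Literal, Question, finditer

tokens = ["in", "not", "lamina", "and", "medulla"]
region = Literal("lamina") | Literal("medulla")
pattern = Question(Literal("not")) + region

for m in finditer(pattern, tokens):
    i, j = m.span()
    print(tokens[i:j])   # ['not', 'lamina'] then ['medulla']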
def has_specific_service_question(clauses):
    # Services of a specific type
    print('\nMatching against specific-company-service rules')
    select = "?siname ?sidesc"
    sparql = None
    matches = []
    keyword = None
    for i, clause in enumerate(clauses):
        print('Question clause', i, ':')
        for index, sW in enumerate(listW_service):
            for m in finditer(sW, clause):
                i, j = m.span()
                matches.extend(clause[i:j])
            if len(matches) != 0:
                keyword = keywords_service[index]
            if keyword is not None:
                break
        for w in clause:
            if w.pos == pos_company:
                e = "?s vocab:company_chName '{company_name}'."\
                    "?s vocab:hasServiceType ?st."\
                    "?st vocab:hasService ?service."\
                    "?service vocab:service_name '{service_name}'."\
                    "?service vocab:hasServiceItem ?si."\
                    "?si vocab:serviceitem_name ?siname."\
                    "?si vocab:serviceitem_description ?sidesc".format(company_name=w.token, service_name=w.token + '-' + keyword)
                sparql = SPARQL_SELECT_TEM.format(prefix=SPARQL_PREXIX,
                                                  select=select,
                                                  expression=e)
                break
    return sparql
def test_finditer1(self):
    tab = self.a + self.b
    regex = tab * (2, None)
    strregex = re.compile("(?:ab){2,}")
    xs = list(refo.finditer(regex, self.seq))
    strxs = list(strregex.finditer(self.string))
    self._eq_list_n_stuff(xs, strxs)
def parse_element_into_books(html_elements):
    # Based on https://github.com/machinalis/refo/blob/master/examples/xml_reader.py
    is_header = lambda elem: elem.get('class').startswith('bookMain')
    is_highlight = lambda elem: elem.get('class').startswith('highlightRow')
    regex = Group(Predicate(is_header) + Plus(Predicate(is_highlight)), 'book')
    groups = [html_elements[g['book'][0]:g['book'][1]]
              for g in finditer(regex, html_elements)]
    return [Book(group) for group in groups]
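The same Group/Predicate pattern works over any Python sequence, not just lxml elements. A minimal, self-contained sketch, where the dicts stand in (hypothetically) for the HTML elements used above:

from refo import Group, Plus, Predicate, finditer

# Hypothetical stand-ins for the lxml elements used above.
elements = [
    {"class": "bookMain a"}, {"class": "highlightRow"}, {"class": "highlightRow"},
    {"class": "bookMain b"}, {"class": "highlightRow"},
]

is_header = Predicate(lambda e: e["class"].startswith("bookMain"))
is_highlight = Predicate(lambda e: e["class"].startswith("highlightRow"))
regex = Group(is_header + Plus(is_highlight), "book")

for m in finditer(regex, elements):
    start, end = m["book"]        # group spans are (start, end) index pairs
    print(elements[start:end])    # one header followed by its highlight rows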
def apply(self, sentence):
    matches = []
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        matches.extend(sentence[i:j])
    return self.action(matches), self.condition_num
def apply(self, sentence):
    matches = []
    # Match the segmented words (the whole input sentence) against the rule and
    # keep only the matched words (other stray tokens may be mixed in).
    # If a sentence contains several sub-clauses that satisfy the rule, all of
    # them are passed to the action function, but only the first one is processed.
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        matches.extend([sentence[i:j]])
    return self.action(matches), self.condition_num
def apply(self, sentence):
    matches = []
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        matches.extend(sentence[i:j])

    if __name__ == '__main__':
        pass
    return self.action(matches)
def apply(self, sentence):
    matches = []
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        print(i, j)
        matches.extend(sentence[i:j])

    if __name__ == '__main__':
        print("----------applying %s----------" % self.action.__name__)
    return self.action(matches)
def test_finditer2(self):
    tab = self.a + self.b
    regex = tab * (2, None) + refo.Group(refo.Plus(self.b), "foobar")
    strregex = re.compile("(?:ab){2,}(b+)")
    xs = list(refo.finditer(regex, self.seq))
    strxs = list(strregex.finditer(self.string))
    xs = [x.group("foobar") for x in xs]
    strxs = [x.span(1) for x in strxs]
    self.assertListEqual(xs, strxs)
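For readers unfamiliar with refo's operators, here is a minimal, self-contained sketch (not part of the test suite above) of the repetition and group syntax these tests exercise; the input sequence is made up:

import refo
from refo import Literal, Plus, Group, finditer

a, b = Literal("a"), Literal("b")
# (ab){2,} followed by a captured run of "b"s, analogous to re's "(?:ab){2,}(b+)"
regex = (a + b) * (2, None) + Group(Plus(b), "tail")

seq = list("xxababbbyy")
for m in finditer(regex, seq):
    i, j = m.span()
    # prints the matched slice 'ababbb' and the (start, end) span of the b-run
    print("".join(seq[i:j]), m.group("tail"))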
def apply(self, sentence):
    matches = []
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        matches.extend(sentence[i:j])
    if len(matches) == 0:
        return None
    else:
        return self.action()
def apply(self, sentence):
    matches = []
    # finditer returns an iterable of matches
    for m in finditer(self.condition, sentence):
        # i, j are the start and end positions of the matched span
        i, j = m.span()
        matches.extend(sentence[i:j])
    print('matches:')
    print(self.action(matches), self.condition_num)
    return self.action(matches), self.condition_num
def apply(self, word_list):
    # There may be several places that satisfy the condition, so store them in the matches list.
    matches = []
    # Use the condition to find matching words; finditer uses yield internally,
    # returning each result as it is found and then continuing the search.
    # In other words, the value returned by finditer can be iterated over.
    for m in finditer(self.condition, word_list):
        i, j = m.span()
        # Extract the matched span of the sentence; it may still contain other stray words.
        matches.extend(word_list[i:j])
    return self.action(matches), self.condition_num
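As the comments above note, finditer lazily yields one Match per non-overlapping hit. A minimal, self-contained sketch of that behaviour, using a plain token list and a simple digit predicate in place of the quepy rule objects (all names here are illustrative, not from the original code):

from refo import Predicate, Plus, finditer

tokens = ["the", "price", "is", "42", "50", "dollars", "and", "7", "euros"]
number = Predicate(str.isdigit)

# Each match covers one maximal run of digit tokens.
for m in finditer(Plus(number), tokens):
    i, j = m.span()
    print(tokens[i:j])   # ['42', '50'] then ['7']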
def apply(self, sentence):
    matches = []
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        matches.extend(sentence[i:j])
    if len(matches) == 0:
        return None, None
    else:
        # for i in matches:
        #     print i.token, i.pos
        return self.action(matches), self.condition_num
def apply(self, sentence):
    matches = []
    for m in finditer(self.condition, sentence):
        # print 1
        i, j = m.span()
        # print 1, i, j
        matches.extend(sentence[i:j])

    if __name__ == '__main__':
        print "----------applying %s----------" % self.action.__name__
    # print matches
    return self.action(matches)
def interpret_ClearSomeCommand(self, match):
    command = IsCommand("clear")
    if getattr(match, "clear_quant", None):
        # TODO: Don't use finditer; just do a search
        for m in finditer(Pos("CD"), match.clear_quant):
            i, j = m.span()
            # TODO: Join with a space? Is the list ever longer than 1, anyway?
            num = ' '.join([c.token for c in match.clear_quant[i:j]])
            command = command + HasEqualTo(num)
    else:
        command = command + HasEqualTo("1")
    return command, "enum"
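A sketch of what the TODO above suggests: refo also provides search(), which returns only the first Match (or None), so no loop is needed when a single quantity is expected. The token list and digit predicate below are made up for illustration; they are not the quepy Pos predicate used in the original:

from refo import Predicate, search

tokens = ["clear", "3", "neurons"]
m = search(Predicate(str.isdigit), tokens)
if m is not None:
    i, j = m.span()
    print(' '.join(tokens[i:j]))   # '3'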
def build_mod_tree(wordlist, synaptic_to):
    """
    Given a list of tokens (as Quepy Words), return a SAST (sub-)tree
    representing the detected neuron attributes ("modifiers") and any
    connectives. synaptic_to represents the phrase which this sequence is
    "(pre-/post-)synaptic to".
    """
    #print "build_mod_tree wordlist", wordlist
    #print "build_mod_tree synaptic_to", synaptic_to
    tokenlist = [x.token.lower() for x in wordlist]
    # We make multiple passes, but "hopefully" the modifier list is not that long.
    for idx, token in enumerate(tokenlist):
        if (token == 'and' or token == 'or') and len(tokenlist) > idx + 2:
            # NOTE: We assume something would come after the following 'and'/'or'.
            if tokenlist[idx + 2] == 'and':
                # NOTE: We currently treat 'and's as disjunctions
                return IsOrOp() \
                    + HasPart(build_mod_tree(wordlist[0:idx + 2], synaptic_to)) \
                    + HasPart(build_mod_tree(wordlist[idx + 3:], synaptic_to))
            elif tokenlist[idx + 2] == 'or':
                return IsOrOp() \
                    + HasPart(build_mod_tree(wordlist[0:idx + 2], synaptic_to)) \
                    + HasPart(build_mod_tree(wordlist[idx + 3:], synaptic_to))
    # If no such ("explicit") 'and'/'or' nodes are found, look for "silent and" nodes.
    for idx, token in enumerate(tokenlist):
        if (token == 'and' or token == 'or') and len(tokenlist) > idx + 2:
            # Based on the scan above, we can assume the idx+2 is not 'and'/'or'.
            # So we treat it as a "silent and".
            return IsAndOp() \
                + HasPart(build_mod_tree(wordlist[0:idx + 2], synaptic_to)) \
                + HasPart(build_mod_tree(wordlist[idx + 2:], synaptic_to))
    # If we've made it this far, we can assume there's zero or one 'and'/'or's.
    mods = []
    for n in finditer(Predicate(lowercase_is_in(modifiers_and_regions)), wordlist):
        r, s = n.span()
        mod_name = tokenlist[r:s][0]
        mods.append(mod_name)
    if len(mods) == 1:
        return get_name_expression(mods[0], synaptic_to)
    # NOTE: (A reminder..:) We assume 'and'/'or' are not biologically relevant terms...
    # NOTE: We currently treat 'and's as disjunctions
    if 'or' in tokenlist or 'and' in tokenlist:
        op = IsOrOp()
    else:
        # NOTE: We assume juxtaposition of modifiers signifies logical 'and'
        op = IsAndOp()
    for mod in mods:
        op += HasPart(get_name_expression(mod, synaptic_to))
    return op
def has_company_basicinfo_question(clauses):
    # Company attributes
    print('\nMatching against company-attribute rules')
    select = "?x"
    sparql = None
    for i, clause in enumerate(clauses):
        print('Question clause', i, ':')
        keyword = None
        matches = []
        for index, cbW in enumerate(listW_company_basic):
            for m in finditer(cbW, clause):
                i, j = m.span()
                matches.extend(clause[i:j])
            if len(matches) != 0:
                keyword = keyWord_company_baisc[index]
            if keyword is not None:
                break
        for w in clause:
            if w.pos == pos_company:
                if keyword == 'company_description':
                    # select = "?x ?y"
                    # e = "?s vocab:company_chName '{company_name}'."\
                    #     "?s vocab:company_baidubaikeDescription ?x."\
                    #     "?s vocab:company_kuaidi100Description ?y.".format(company_name=w.token)
                    select = "?y"
                    e = "?s vocab:company_chName '{company_name}'."\
                        "?s vocab:company_kuaidi100Description ?y.".format(company_name=w.token)
                else:
                    e = "?s vocab:company_chName '{company_name}'."\
                        "?s vocab:{keyword} ?x.".format(company_name=w.token, keyword=keyword)
                sparql = SPARQL_SELECT_TEM.format(prefix=SPARQL_PREXIX,
                                                  select=select,
                                                  expression=e)
                break
    return sparql
def apply(self, sentence):
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        return self.action(sentence[i:j])
def apply(self, word_objects):
    matches = []
    for m in finditer(self.condition, word_objects):
        i, j = m.span()
        matches.extend(word_objects[i:j])
    return self.action(matches), self.condition_weight, self.description
parser.add_argument("filename", action="store")
cfg = parser.parse_args()
text = open(cfg.filename).read()

from refo import finditer, Predicate, Literal, Any, Group, Star


def notin(xs):
    return lambda x: x not in xs

name = Predicate(notin("/")) + Star(Predicate(notin(" >")))
name = Group(name, "name")
inside = name + Star(Any(), greedy=False)
opentag = Literal("<") + inside + Literal(">")
opentag = Group(opentag, "open")
closetag = Literal("<") + Literal("/") + inside + Literal(">")
closetag = Group(closetag, "close")
regex = closetag | opentag

depth = 0
for m in finditer(regex, text):
    if "open" in m:
        i, j = m["name"]
        print " " * depth + text[i:j]
        depth += 1
    else:
        assert "close" in m
        depth -= 1
def apply(self, sentence):
    for m in finditer(self.condition, sentence):
        i, j = m.span()
        if "victim" in m:
            i, j = m.span("victim")
        self.action(sentence[i:j])
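A minimal, self-contained sketch of the named-group lookup used above ("victim" in m, m.span("victim")); the token list and pattern are made up for illustration:

from refo import Group, Literal, Predicate, finditer

tokens = ["the", "dog", "bit", "the", "cat"]
noun_phrase = Literal("the") + Predicate(str.isalpha)
regex = Literal("bit") + Group(noun_phrase, "victim")

for m in finditer(regex, tokens):
    i, j = m.span()                 # whole match: ['bit', 'the', 'cat']
    if "victim" in m:
        i, j = m.span("victim")     # just the named group: ['the', 'cat']
    print(tokens[i:j])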
def main(): ##set_file_name = raw_input('Enter a file name: ') file_name = raw_input('Enter a file name: ') test_file = open(file_name, 'r') rawtext = test_file.read() ##GET ALL KEYWORDS get_all_keywords = [] #Extract title from text title = get_title(rawtext) first_sen = get_first_sen(rawtext) #Get paragraph without title para_list = rawtext.splitlines()[1:] #in list para_string = ''.join(para_list) #convert to string #Prettify paragraph prettify_txt = re.sub(r'[^\w.]', ' ', para_string) mod_txt = remov_stopword(prettify_txt) #Tokenizing & POS Tagging token_txt = nltk.sent_tokenize(mod_txt) #Line Segment num_sent = len(token_txt) #Number of sentences token_word = [nltk.word_tokenize(sent) for sent in token_txt] pos_tag = [nltk.pos_tag(sent) for sent in token_word] ##print title print "Sentence: ", num_sent print '\n' #Chunk and print NP get_nouns = [[Word(*x) for x in sent] for sent in pos_tag] #NNP Rules rule_0 = W(pos = "NNS")| W(pos = "NN") | W(pos = "NNP") rule_05 = W(pos = "NNP") + W(pos = "NNS") rule_1 = W(pos = "WP$") + W(pos = "NNS") rule_2 = W(pos = "CD") + W(pos = "NNS") rule_3 = W(pos = "NN") + W(pos = "NN") rule_4 = W(pos = "NN") + W(pos = "NNS") rule_5 = W(pos = "NNP") + W(pos = "CD") rule_6 = W(pos = "NNP") + W(pos = "NNP") rule_7 = W(pos = "NNP") + W(pos = "NNPS") rule_8 = W(pos = "NNP") + W(pos = "NN") rule_9 = W(pos = "NNP") + W(pos = "VBZ") rule_10 = W(pos = "DT") + W(pos = "NNS") rule_11 = W(pos = "DT") + W(pos = "NN") rule_12 = W(pos = "DT") + W(pos = "NNP") rule_13 = W(pos = "JJ") + W(pos = "NN") rule_14 = W(pos = "JJ") + W(pos = "NNS") rule_15 = W(pos = "PRP$") + W(pos = "NNS") rule_16 = W(pos = "PRP$") + W(pos = "NN") rule_02 = W(pos = "NN") + W(pos = "NN") + W(pos = "NN") rule_17 = W(pos = "NN") + W(pos = "NNS") + W(pos = "NN") rule_18 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP") rule_19 = W(pos = "JJ") + W(pos = "NN") + W(pos = "NNS") rule_20 = W(pos = "PRP$") + W(pos = "NN") + W(pos = "NN") rule_21 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN") rule_22 = W(pos = "DT") + W(pos = "CD") + W(pos = "NNS") rule_23 = W(pos = "DT") + W(pos = "VBG") + W(pos = "NN") rule_24 = W(pos = "DT") + W(pos = "NN") + W(pos = "NN") rule_25 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "VBZ") rule_26 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN") rule_27 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP") rule_28 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN") rule_29 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP") rule_30 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN") + W(pos = "NN") NP_bi_gram_set = (rule_05)|(rule_1)|(rule_2)|(rule_3)|(rule_4)|(rule_5)|(rule_6)|(rule_7)|(rule_8)|(rule_9)|(rule_10)|(rule_11)|(rule_12)|(rule_13)|(rule_14)|(rule_15)|(rule_16) NP_tri_gram_set = (rule_02)|(rule_17)|(rule_18)|(rule_19)|(rule_20)|(rule_21)|(rule_22)|(rule_23)|(rule_24)|(rule_25)|(rule_26)|(rule_27)|(rule_28) NP_quard_gram_set = (rule_29)|(rule_30) #Rule set function get_uni_gram = (rule_0) get_bi_gram = NP_bi_gram_set get_tri_gram = NP_tri_gram_set get_quard_gram = NP_quard_gram_set bag_of_NP = [] bag_of_biNP = [] bag_of_triNP = [] bag_of_fourNP = [] total__tfidf = 0 ###################################GET UNIGRAMS################################### ##print "UNIGRAM -->" for k, s in enumerate(get_nouns): for match in finditer(get_uni_gram, s): x, y = match.span() #the match spans x to y inside the sentence s #print pos_tag[k][x:y] bag_of_NP += pos_tag[k][x:y] ############### #Term Frequency for unigrams ##print "\nUnigram Feature Matrices:" total__tfidf = 0 
uni_tfidf_values = '' str_uni_grams = '' total_docs = count_total_corpus() fdist = nltk.FreqDist(bag_of_NP) print fdist ##STORE UNIGRAMS unzipped_uni = zip(*bag_of_NP) str_unigrams = list(unzipped_uni[0]) get_unigrams = zip(str_unigrams,str_unigrams[1:])[::1] ############### ##UNI MAXIMUM TermScore## scores = [] for word in fdist: score = fdist[word] scores.append(score) max_uni = max(scores) ###################### for word in fdist: fq_word = fdist[word] ##print '%s->%d' % (word, fq_word) get_tf = term_frequency(fq_word, max_uni) ### FEATURES ### ##Tuple to String## to_string = ':'.join(word) get_this_string = convert_to_string(to_string) ##DF Score num_of_doc_word = count_nterm_doc(get_this_string) ## ##TF.IDF Score idf_score = inverse_df(total_docs, num_of_doc_word) tf_idf_scr = get_tf * idf_score total__tfidf += tf_idf_scr ##GET EACH UNIGRAMS TFIDF uni_tfidf_scr = repr(tf_idf_scr)+' ' uni_tfidf_values += uni_tfidf_scr str_uni_grams += get_this_string+',' ##BUILD DICT FOR EACH TERMS get_uni_float = [float(x) for x in uni_tfidf_values.split()] get_uni_list = str_uni_grams.split(',') unigram_dict = dict(zip(get_unigrams, get_uni_float)) ########################### ##GET TFIDF FOR UNIGRAMS## ############ uni_avg_tfidf = (sum(map(float,get_uni_float)))/(len(get_uni_float)) ########################### get_zip_str = [''.join(item) for item in str_unigrams] ###Unigrams string with TFIDF### unigrams_list = zip(get_zip_str, get_uni_float) ########################### ##print '===============***===============' ## print 'Total Unigrams: ', len(fdist) ## print 'Total tfidf', total__tfidf ##print 'Average TF.IDF: ', uni_avg_tfidf ##print '===============***===============' ########################### ##### TFIDF FEATURE MATRIX ##### uni_feat_tfidf = [] for x in unigrams_list: if float(x[1]) > uni_avg_tfidf: uni_feat_tfidf.append(1) else: uni_feat_tfidf.append(0) zip_tfidf_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf) ##print zip_tfidf_feat ############################### ##### First Sentence Feat ##### uni_fir_sen = [] for x in unigrams_list: get_res = chk_frs_sen(x[0], file_name) if get_res == 1: uni_fir_sen.append(1) else: uni_fir_sen.append(0) zip_fir_sen_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen) ############################ ##### Involve in Title ##### uni_title_feat = [] for x in unigrams_list: get_res = involve_in_title(x[0], title) if get_res == 1: uni_title_feat.append(1) else: uni_title_feat.append(0) zip_uni_feats = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen, uni_title_feat) ##print zip_uni_feats ################################ ##print "\n\n" ###################################GET BIGRAMS################################### ##print "BIGRAM -->" for k, s in enumerate(get_nouns): for match in finditer(get_bi_gram, s): x, y = match.span() ##print pos_tag[k][x:y] bag_of_biNP += pos_tag[k][x:y] ##Term Frequency for bigrams## total__tfidf = 0 bi_tfidf_values = '' str_bi_grams = '' ############### ##STORE BIGRAMS unzipped = zip(*bag_of_biNP) str_bigrams = list(unzipped[0]) get_bigrams = zip(str_bigrams,str_bigrams[1:])[::2] ############### ##print "\nBigram Feature Matrices:" bi_dist = nltk.FreqDist(bag_of_biNP) ##BI MAXIMUM TermScore## bi_scores = [] for word in bi_dist: score = bi_dist[word] bi_scores.append(score) max_bi = max(bi_scores) ###################### for word in bi_dist: tq_word = bi_dist[word] ##print '%s-->%d' % (word, tq_word) get_tf = term_frequency(tq_word, max_bi) ### FEATURES ### ##Tuple to String## to_string = ':'.join(word) 
get_this_string = convert_to_string(to_string) ##DF Score num_of_doc_word = count_nterm_doc(get_this_string) ##TF.IDF Score idf_score = inverse_df(total_docs, num_of_doc_word) tf_idf_scr = get_tf*idf_score total__tfidf += tf_idf_scr ##GET EACH BIGRAMS TFIDF get_tfidf_scr = repr(tf_idf_scr)+' ' bi_tfidf_values += get_tfidf_scr str_bi_grams += get_this_string+',' ##BUILD DICT FOR EACH TERMS get_float = [float(x) for x in bi_tfidf_values.split()] get_bi_list = str_bi_grams.split(',') bigram_dict = dict(zip(get_bi_list, get_float)) ########################### ##GET TFIDF FOR BIGRAMS## get_bi_floats = get_val_bipairs(bigram_dict, get_bigrams) get_zip = dict(zip(get_bigrams, get_bi_floats)) ############ real_avg_tfidf = (sum(map(float,get_bi_floats)))/(len(get_bi_floats)) ########################### get_zip_str = [' '.join(item) for item in get_bigrams] ###Bigrams string with TFIDF### bigrams_list = zip(get_zip_str, get_bi_floats) ########################### ##print bigrams_list ##print '===============***===============' ##print 'Total Bigrams: ', len(get_bi_floats) ##print 'total tfidf: ', sum(map(float,get_bi_floats)) ##print 'Average TF.IDF: ', real_avg_tfidf ##print '===============***===============' ##print len(bi_str2_float(bi_tfidf_values)) ##print type(bag_of_biNP) ##### TFIDF FEATURE MATRIX ##### feat_tfidf_matx = [] for x in bigrams_list: if float(x[1]) > real_avg_tfidf: feat_tfidf_matx.append(1) else: feat_tfidf_matx.append(0) tfidf_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx) ################################# #### FIRST SENTENCE FEATURE #### feat_fir_sen = [] for x in tfidf_feat: get_res = chk_frs_sen(x[0], file_name) if get_res == 1: feat_fir_sen.append(1) else: feat_fir_sen.append(0) fir_sen_feat = zip (get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen) ##print fir_sen_feat ################################# #### INVOLVE IN TITLE FEATURE ### feat_invol_tit = [] for x in fir_sen_feat: get_res = involve_in_title(x[0], title) if get_res == 1: feat_invol_tit.append(1) else: feat_invol_tit.append(0) invol_tit_feat = zip (get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen, feat_invol_tit) ##print invol_tit_feat ################################# ##print "\n\n" ###################################GET TRIGRAMS################################### ##print "TRIGRAM -->" for k, s in enumerate(get_nouns): for match in finditer(get_tri_gram, s): x, y = match.span() ##print pos_tag[k][x:y] bag_of_triNP += pos_tag[k][x:y] #Term Frequency for trigrams total__tfidf = 0 tri_tfidf_values = '' str_tri_grams = '' ############### ##STORE TRIGRAMS unzipped_tri = zip(*bag_of_triNP) str_trigrams = list(unzipped_tri[0]) get_trigrams = zip(str_trigrams,str_trigrams[1:],str_trigrams[2:])[::3] ############### ##print "\nTrigram Feature Matrices:" tri_dist = nltk.FreqDist(bag_of_triNP) ##TRI MAXIMUM TermScore## tri_scores = [] for word in tri_dist: score = tri_dist[word] tri_scores.append(score) max_tri = max(tri_scores) ###################### for word in tri_dist: tr_fq = tri_dist[word] ##print '%s-->%d' % (word, tr_fq) get_tf = term_frequency(tr_fq, max_tri) ### FEATURES ### ##Tuple to String## to_string = ':'.join(word) get_this_string = convert_to_string(to_string) ##DF Score num_of_doc_word = count_nterm_doc(get_this_string) ## ##TF.IDF Score idf_score = inverse_df(total_docs, num_of_doc_word) tf_idf_scr = get_tf * idf_score total__tfidf += tf_idf_scr ##GET EACH TRIGRAMS TFIDF get_tfidf_scr = repr(tf_idf_scr)+' ' tri_tfidf_values += get_tfidf_scr str_tri_grams += get_this_string+',' 
##BUILD DICT FOR EACH TERMS get_tri_float = [float(x) for x in tri_tfidf_values.split()] get_tri_list = str_tri_grams.split(',') trigram_dict = dict(zip(get_tri_list, get_tri_float)) ########################### ##GET TFIDF FOR TRIGRAMS## get_tri_floats = get_val_tripairs(trigram_dict, get_trigrams) get_tri_zip = dict(zip(get_trigrams, get_tri_floats)) ############ tri_avg_tfidf = (sum(map(float,get_tri_floats)))/(len(get_tri_floats)) ########################### get_ziptri_str = [' '.join(item) for item in get_trigrams] ###Bigrams string with TFIDF### trigrams_list = zip(get_ziptri_str, get_tri_floats) ########################### ##print '===============***===============' ##print 'Total Trigrams: ', len(get_tri_floats) ##print 'Total tfidf', sum(map(float,get_tri_floats)) ##print 'Average TF.IDF: ', tri_avg_tfidf ##print '===============***===============' ##### TFIDF FEATURE MATRIX ##### tri_tfidf_matx = [] for x in trigrams_list: if float(x[1]) > tri_avg_tfidf: tri_tfidf_matx.append(1) else: tri_tfidf_matx.append(0) tri_tfidf_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx) ################################ #### FIRST SENTENCE FEATURE #### tri_fir_sen = [] for x in tri_tfidf_feat: get_res = chk_frs_sen(x[0], file_name) if get_res == 1: tri_fir_sen.append(1) else: tri_fir_sen.append(0) tri_sen_feat = zip (get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen) ################################# #### INVOLVE IN TITLE FEATURE ### tri_invol_tit = [] for x in tri_sen_feat: get_res = involve_in_title(x[0], title) if get_res == 1: tri_invol_tit.append(1) else: tri_invol_tit.append(0) tri_tit_feat = zip (get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen, tri_invol_tit) ##print tri_tit_feat ################################# ##print "\n\n" ###################################GET 4-GRAMS################################### ##print "4th GRAM -->" for k, s in enumerate(get_nouns): for match in finditer(get_quard_gram, s): x,y = match.span() ##print pos_tag[k][x:y] bag_of_fourNP += pos_tag[k][x:y] #Term Frequency for 4-grams total__tfidf = 0 four_tfidf_values = '' str_four_grams = '' ############### if (len(bag_of_fourNP)>0): ##STORE 4-GRAMS unzipped_four = zip(*bag_of_fourNP) str_fourgrams = list(unzipped_four[0]) get_fourgrams = zip(str_fourgrams,str_fourgrams[1:],str_fourgrams[2:],str_fourgrams[3:])[::4] ############### #Term Frequency for 4-grams total__tfidf = 0 ##print "\n4-grams Feature Matrices:" f_dist = nltk.FreqDist(bag_of_fourNP) ##4 MAXIMUM TermScore## four_scores = [] for word in f_dist: score = f_dist[word] four_scores.append(score) max_four = max(four_scores) ###################### for word in f_dist: fr_fq = f_dist[word] ##print '%s-->%d' % (word, fr_fq) get_tf = term_frequency(fr_fq, max_four) ### FEATURES ### ##Tuple to String## to_string = ':'.join(word) get_this_string = convert_to_string(to_string) ##DF Score num_of_doc_word = count_nterm_doc(get_this_string) ## ##TF.IDF Score idf_score = inverse_df(total_docs, num_of_doc_word) tf_idf_scr = get_tf * idf_score total__tfidf += tf_idf_scr ##GET EACH FOURGRAMS TFIDF get_tfidf_scr = repr(tf_idf_scr)+' ' four_tfidf_values += get_tfidf_scr str_four_grams += get_this_string+',' ##BUILD DICT FOR EACH TERMS get_four_float = [float(x) for x in four_tfidf_values.split()] get_four_list = str_four_grams.split(',') fourgram_dict = dict(zip(get_four_list, get_four_float)) ########################### ##GET TFIDF FOR 4-GRAMS## get_four_floats = get_val_fpairs(fourgram_dict, get_fourgrams) get_four_zip = dict(zip(get_fourgrams, 
get_four_floats)) ############ four_avg_tfidf = (sum(map(float,get_four_floats)))/(len(get_four_floats)) ########################### get_zipfour_str = [' '.join(item) for item in get_fourgrams] ###Bigrams string with TFIDF### fourgrams_list = zip(get_zipfour_str, get_four_floats) ########################### ##print '===============***===============' ##print 'Total 4-grams: ', len(get_four_floats) ##print 'Total tfidf', sum(map(float,get_four_floats)) ##print 'Average TF.IDF: ', four_avg_tfidf ##print '===============***===============' ##### TFIDF FEATURE MATRIX ##### four_tfidf_matx = [] for x in fourgrams_list: if float(x[1]) > four_avg_tfidf: four_tfidf_matx.append(1) else: four_tfidf_matx.append(0) four_tfidf_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx) ################################# #### FIRST SENTENCE FEATURE #### four_fir_sen = [] for x in four_tfidf_feat: get_res = chk_frs_sen(x[0], file_name) if get_res == 1: four_fir_sen.append(1) else: four_fir_sen.append(0) four_sen_feat = zip (get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen) ################################# #### INVOLVE IN TITLE FEATURE ### four_invol_tit = [] for x in tri_sen_feat: get_res = involve_in_title(x[0], title) if get_res == 1: four_invol_tit.append(1) else: four_invol_tit.append(0) four_tit_feat = zip (get_zipfour_str, get_four_floats,four_tfidf_matx, four_fir_sen, four_invol_tit) ##print four_tit_feat ################################# else: four_tit_feat = '' print 'Zero Fourgram\n' ##print zip_uni_feats, invol_tit_feat, tri_tit_feat, four_tit_feat ##print uni_avg_tfidf,real_avg_tfidf, tri_avg_tfidf,four_avg_tfidf key_unigram = cal_matrix(zip_uni_feats, uni_avg_tfidf,'uni_tf.txt','uni_fs.txt','uni_tit.txt') print '\n' key_bigram = cal_matrix(invol_tit_feat, real_avg_tfidf,'bi_tf.txt','bi_fs.txt','bi_tit.txt') print '\n' key_trigram = cal_tri_matrix(tri_tit_feat, tri_avg_tfidf,'tri_tf.txt','tri_fs.txt','tri_tit.txt') print '\n' if not four_tit_feat: print 'No 4-grams in document.' get_all_keywords = key_unigram + key_bigram + key_trigram print len(get_all_keywords),' keywords for total n-grams.' get_time = (time.time() - start_time) get_milli = get_time*1000 print("--- %s seconds ---" % get_time) else: key_four = cal_four_matrix(four_tit_feat, four_avg_tfidf,'four_tf.txt','four_fs.txt','four_tit.txt') ##get_all_keywords = key_unigram + key_bigram + key_trigram + key_four get_all_keywords = key_unigram + key_bigram + key_trigram + key_four print len(get_all_keywords),' keywords for total n-grams.' get_time = (time.time() - start_time) get_milli = get_time*1000 print("--- %s seconds ---" % get_time) ##GET SUMMARY## summary(key_unigram, title, prettify_txt)
def interpret_NeuronsQuery_MoreGeneral(self, match):
    #print "interpret_NeuronsQuery_MoreGeneral", match._words, match.words, match._particles
    neuron = IsNeuron() + HasClass('Neuron')
    # NOTE: "format" group overrides any "opener" group--for formatting
    # e.g. "List neurons in Lamina as morphology" will use morphology formatting.
    if getattr(match, 'formatting', None):
        form_lems = match.formatting.lemmas.lower()
        if 'list' in form_lems or 'information' in form_lems:
            neuron = neuron + HasFormat('information')
        elif 'network' in form_lems:
            neuron = neuron + HasFormat('network')
        # NOTE: We don't even bother checking for "morphology", since that's assumed default.
    elif getattr(match, 'opener', None):
        form_lems = match.opener.lemmas.lower()
        if 'list' in form_lems:
            neuron = neuron + HasFormat('information')
        elif 'graph' in form_lems:
            neuron = neuron + HasFormat('network')
    if getattr(match, 'region_list', None):
        neuron = neuron + OwnedBy(get_region_owner(match.region_list, notneurons))
    if getattr(match, 'neuron_modifiers', None):
        mods = []
        # NOTE: The following assumes whitespace separates JJs / NNs:
        for m in finditer((Pos("JJ") | Pos("NN")), match.neuron_modifiers):
            i, j = m.span()
            mod_name = match.neuron_modifiers[i:j][0].token.lower()
            mods.append(mod_name)
        if len(mods) == 1:
            modifier = get_name_expression(mods[0])
            neuron = neuron + Has(modifier)
        elif len(mods) > 1:
            # NOTE: We assume this is a disjunction of modifiers for now
            andop = IsOrOp()
            for mod in mods:
                modifier = get_name_expression(mod)
                andop += HasPart(modifier)
            neuron += Has(andop)
    if getattr(match, "transmitters", None):
        mods = []
        # NOTE: The following assumes whitespace separates adjnouns:
        for m in finditer(adjnoun, match.transmitters):
            i, j = m.span()
            mod_name = match.transmitters[i:j][0].token.lower()
            mods.append(mod_name)
        if len(mods) == 1:
            modifier = get_name_expression(mods[0])
            neuron = neuron + Has(modifier)
        elif len(mods) > 1:
            # NOTE: We assume this is a disjunction of modifiers for now
            andop = IsOrOp()
            for mod in mods:
                modifier = get_name_expression(mod)
                andop += HasPart(modifier)
            neuron += Has(andop)
    if getattr(match, "neuron_name", None):
        # In keeping with the "spirit of the tree" (as described in the codegen file),
        # we put the neuron name in the neuron node if there's only 1 name;
        # We only need to create a 'has' node if there's more than 1 name (e.g. and, or).
        # For now, we only support one name to begin with.
        neuron = neuron + HasName(match.neuron_name.tokens)
    if getattr(match, "expressing_marker", None):
        marker = IsGeneticMarker() + HasName(match.expressing_marker.lemmas)
        neuron = neuron + HasGeneticMarker(marker)
    if getattr(match, "conn_quant", None):
        quantdir = IsNumConnections()
        # TODO: Don't use finditer; just do a search. Clean this up.
        for n in finditer(Pos("CD"), match.conn_quant):
            r, s = n.span()
            conn_num = ' '.join([c.token for c in match.conn_quant[r:s]])
            moreorless = False
            for o in finditer(Lemmas("more than"), match.conn_quant):
                quantdir = quantdir + HasMoreThan(conn_num)
                moreorless = True
            for o in finditer(Lemmas("less than"), match.conn_quant):
                quantdir = quantdir + HasLessThan(conn_num)
                moreorless = True
            if not moreorless:
                quantdir = quantdir + HasEqualTo(conn_num)
        for n in finditer(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"),
                          match.conn_quant):
            r, s = n.span()
            conn_target = ' '.join([c.token for c in match.conn_quant[r:s]])
            # TODO: Make conn_target.lower() ?
            quantdir = quantdir + HasConnectionsTarget(conn_target)
        neuron = neuron + HasConnections(quantdir)
    # neuron_label = NameOf( neuron )
    # return neuron_label, "enum"
    return neuron, "enum"
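Per the TODO above, the presence tests with Lemmas("more than") / Lemmas("less than") only need a boolean, so refo.search is sufficient. A self-contained sketch with plain string tokens and Literal sequences standing in (hypothetically) for the quepy Words and Lemmas predicates:

from refo import Literal, search

tokens = ["neurons", "with", "more", "than", "5", "connections"]
more_than = Literal("more") + Literal("than")
less_than = Literal("less") + Literal("than")

if search(more_than, tokens) is not None:
    print("HasMoreThan")
elif search(less_than, tokens) is not None:
    print("HasLessThan")
else:
    print("HasEqualTo")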
def get_subquery(m, matchwords): neuron = IsNeuron() + HasClass('Neuron') owned_region = None global syn_num #print matchwords #print m.state if 'synapse_num_clause' in m: p, q = m['synapse_num_clause'] conn_quant_words = matchwords[p:q] # TODO: Perform a search instead of finditer for n in finditer(Pos("CD"), conn_quant_words): r, s = n.span() conn_num = ' '.join( [c.token for c in conn_quant_words[r:s]]) moreorless = False # See above... for o in finditer(Lemmas("more than"), conn_quant_words): syn_num = HasMoreThan(conn_num) moreorless = True for o in finditer(Lemmas("less than"), conn_quant_words): syn_num = HasLessThan(conn_num) moreorless = True for o in finditer(Lemma("atleast"), conn_quant_words): syn_num = HasAtLeast(conn_num) moreorless = True for o in finditer(Lemma("atmost"), conn_quant_words): syn_num = HasAtMost(conn_num) moreorless = True for o in finditer(Lemmas("at least"), conn_quant_words): syn_num = HasAtLeast(conn_num) moreorless = True for o in finditer(Lemmas("at most"), conn_quant_words): syn_num = HasAtMost(conn_num) moreorless = True if not moreorless: syn_num = HasEqualTo(conn_num) #print "syn_num", syn_num # The (sub-)subquery which this subquery is "(pre-/post-)synaptic to". synaptic_to = None if 'synaptic_phrase' in m: p, q = m['synaptic_phrase'] synaptic_phrase = matchwords[p:q] # TODO: Clean this up. spl = [w.lemma.lower() for w in synaptic_phrase] to_idx = spl.index('to') syn_type = set() or_syns = False if 'presynaptic' in spl[:to_idx]: syn_type.add('presynaptic') if 'postsynaptic' in spl[:to_idx]: syn_type.add('postsynaptic') if 'or' in spl[:to_idx]: or_syns = True # NOTE: We currently only support one subquery here, anyway. for n in finditer(self.subquery, synaptic_phrase[to_idx + 1:]): r, s = n.span() synaptic_to, _ = get_subquery(n, synaptic_phrase[to_idx + 1:]) if 'presynaptic' in syn_type: neuron += PresynapticTo(synaptic_to) elif 'postsynaptic' in syn_type: neuron += PostsynapticTo(synaptic_to) if syn_num: neuron += syn_num # This is basically just a trick to update the existing ("parent") subquery. # TODO: Clean this up. for m in finditer(self.subquery, matchwords[:p]): break if 'region_list' in m: p, q = m['region_list'] owned_region = get_region_owner( matchwords[p:q], Predicate(lowercase_is_in(regions))) neuron = neuron + OwnedBy(owned_region) # We identify transmitters and neuron types with the "has" relation (e.g. 
in the SAST) # so to support conjunctions/disjunctions of these modifiers, while also keeping the SAST # "simple" with at most one "has" relation per node, we calculate the "has" relations later has_modifiers = [] if 'neuron_modifiers' in m: p, q = m['neuron_modifiers'] modifiers_words = [x for x in matchwords[p:q] if x.pos != ','] has_modifiers.append( build_mod_tree(modifiers_words, synaptic_to)) if 'transmitters' in m: p, q = m['transmitters'] modifiers_words = [x for x in matchwords[p:q] if x.pos != ','] has_modifiers.append( build_mod_tree(modifiers_words, synaptic_to)) if 'neurons' in m: p, q = m['neurons'] # NOTE: We assume that this can only be "interneuron(s)" or "neuron(s)" if 'interneuron' in ''.join([x.lemma for x in matchwords[p:q]]): has_modifiers.append(IsAttribute() + HasKey('locality') + HasValue('True')) else: # NOTE: For now, we assume that a neuron 'name/type' (and not "neuron") is present # for n in finditer( Pos("CD"), conn_quant_words ): pass if 'expressing_marker' in m: p, q = m['expressing_marker'] expressing_lemmas = [x.lemma for x in matchwords[p:q]] # This is just a temporary solution--before support for genetic markers is added. marker = IsGeneticMarker() + HasName( ' '.join(expressing_lemmas)) neuron = neuron + HasGeneticMarker(marker) # TODO: Include this as a 'has' relation (as above)? if 'conn_quant' in m: # NOTE: This is currently unused by the code generator p, q = m['conn_quant'] conn_quant_words = matchwords[p:q] quantdir = IsNumConnections() # TODO: Perform a search instead of finditer for n in finditer(Pos("CD"), conn_quant_words): r, s = n.span() conn_num = ' '.join( [c.token for c in conn_quant_words[r:s]]) moreorless = False # See above... for o in finditer(Lemmas("more than"), conn_quant_words): quantdir = quantdir + HasMoreThan(conn_num) moreorless = True for o in finditer(Lemmas("less than"), conn_quant_words): quantdir = quantdir + HasLessThan(conn_num) moreorless = True if not moreorless: quantdir = quantdir + HasEqualTo(conn_num) for n in finditer( Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"), conn_quant_words): r, s = n.span() conn_target = ' '.join( [c.token for c in conn_quant_words[r:s]]) # TODO: Make conn_target.lower() ? quantdir = quantdir + HasConnectionsTarget(conn_target) neuron = neuron + HasConnections(quantdir) if 'connections_clause' in m: p, q = m['connections_clause'] connections_words = [ x for x in matchwords[p:q] if x.pos != ',' ] connectives = [] segments = [[]] seg_idx = 0 for word in connections_words: if word.lemma.lower() == 'and': connectives.append('and') segments.append([]) seg_idx += 1 continue if word.lemma.lower() == 'or': connectives.append('or') segments.append([]) seg_idx += 1 continue segments[seg_idx].append(word) # NOTE: We assume that no "segment" list is empty--based on our grammar # Scan from left to right in 'connections_clause' last_conn_type = None region_name = None connection_nodes = [] for segment in segments: # NOTE: The order of these loops (which shouldn't be loops) currently matters. # NOTE: We assume no region has these terms in their name/synonyms. # TODO: Clean this up. 
for n in finditer( Lemma('connection') | Lemma('process') | Lemma('arborization') | Lemma('arborizations') | Lemma('arborize') | Lemma('innervate') | Lemma('innervation'), segment): last_conn_type = 'arbors' break for n in finditer( Lemma('dendrite') | Lemma('input') | Lemma('dendritic'), segment): last_conn_type = 'dendriteArbors' break for n in finditer( Lemma('axon') | Lemma('axonal') | Lemma('axonic') | Lemma('output'), segment): last_conn_type = 'axonArbors' break for n in finditer(Predicate(lowercase_is_in(arbregions)), segment): r, s = n.span() region_name = ' '.join( [comp.token for comp in segment[r:s]]).lower() if region_name not in arbregions: log.error('Unknown region name: ' + region_name) # TODO: Handle gracefully. else: region_name = arbregions[region_name] # NOTE: We assume there's exactly one region per segment # NOTE: We assume last_conn_type was set at least initially--based on our grammar conn_region = HasType(last_conn_type) conn_region += HasRegion(region_name) connection_nodes.append(conn_region) # NOTE: We assume there is at least one element in connection_nodes # NOTE: We assume the number of connectives is one less than len(connection_nodes) if len(connection_nodes) == 1: has_modifiers.append(connection_nodes[0]) else: connective = None if connectives.pop(0) == 'and': connective = IsAndOp() + HasPart(connection_nodes.pop(0)) \ + HasPart(connection_nodes.pop(0)) else: connective = IsOrOp() + HasPart(connection_nodes.pop(0)) \ + HasPart(connection_nodes.pop(0)) while len(connectives) > 0: if connectives.pop(0) == 'and': connective = IsAndOp() + HasPart(connective) \ + HasPart(connection_nodes.pop(0)) else: connective = IsOrOp() + HasPart(connective) \ + HasPart(connection_nodes.pop(0)) has_modifiers.append(connective) if 'is_connecting' in m: p, q = m['is_connecting'] region_pair = [] for n in finditer(Predicate(lowercase_is_in(arbregions)), matchwords[p:q]): r, s = n.span() r, s = r + p, s + p # Get the offset from matchwords region_name = ' '.join( [comp.token for comp in matchwords[r:s]]).lower() if region_name not in arbregions: log.error('Unknown region name: ' + region_name) # TODO: Handle gracefully else: region_pair.append(arbregions[region_name]) # Check that there were exactly two regions. NOTE: This could be enforced by the grammar. if len(region_pair) == 2: # NOTE: We assume the first region we parse is the "from" region. if 'and' in [x.lemma for x in matchwords[p:q]]: conn1 = IsConnection() + FromRegion( region_pair[0]) + ToRegion(region_pair[1]) conn2 = IsConnection() + FromRegion( region_pair[1]) + ToRegion(region_pair[0]) connecting_node = IsOrOp() + HasPart(conn1) + HasPart( conn2) else: # NOTE: We assume 'to' is present connecting_node = IsConnection() + FromRegion( region_pair[0]) + ToRegion(region_pair[1]) # NOTE: At least for now, we'll put connections in with the 'has' relations # but NOTE that this only works if codegen_optimization to true. # neuron += Connecting( connecting_node ) has_modifiers.append(connecting_node) # Now create a single "has node" for this subquery's neuron. if len(has_modifiers) > 1: # NOTE: We assume all 'has' objects are "conjuncted together". has_node = IsAndOp() for mod in has_modifiers: has_node += HasPart(mod) neuron += Has(has_node) elif len(has_modifiers) == 1: neuron += Has(has_modifiers[0]) return neuron, owned_region
"... sort of.") parser.add_argument("filename", action="store") cfg = parser.parse_args() text = open(cfg.filename).read() from refo import finditer, Predicate, Literal, Any, Group, Star def notin(xs): return lambda x: x not in xs name = Predicate(notin("/")) + Star(Predicate(notin(" >"))) name = Group(name, "name") inside = name + Star(Any(), greedy=False) opentag = Literal("<") + inside + Literal(">") opentag = Group(opentag, "open") closetag = Literal("<") + Literal("/") + inside + Literal(">") closetag = Group(closetag, "close") regex = closetag | opentag depth = 0 for m in finditer(regex, text): if "open" in m: i, j = m["name"] print(" " * depth + text[i:j]) depth += 1 else: assert "close" in m depth -= 1
def interpret_NeuronsQuery_MoreSpecific(self, match): # NOTE: If a subquery has a prepositional phrase attached (e.g. "in [regions]"), # then we should see if the preceding subqueries lack a prepositional phrase. # By default, attach the prep. phrase to the preceding subqueries as well. # But we'd prefer to alert the user and have them check this. # subquery_list is a list of tuples, where the first element is the Expression tree (SAST) # and the second element contains the sub-tree corresponding to any owned_by region(s) #print "interpret_NeuronsQuery_MoreSpecific", match._words, match.words, match._particles global syn_num syn_num = None subquery_list = [] for mtch in finditer(self.subquery, match.words): i, j = mtch.span() #for x in mtch.state: # print x, mtch.state[x] def get_subquery(m, matchwords): neuron = IsNeuron() + HasClass('Neuron') owned_region = None global syn_num #print matchwords #print m.state if 'synapse_num_clause' in m: p, q = m['synapse_num_clause'] conn_quant_words = matchwords[p:q] # TODO: Perform a search instead of finditer for n in finditer(Pos("CD"), conn_quant_words): r, s = n.span() conn_num = ' '.join( [c.token for c in conn_quant_words[r:s]]) moreorless = False # See above... for o in finditer(Lemmas("more than"), conn_quant_words): syn_num = HasMoreThan(conn_num) moreorless = True for o in finditer(Lemmas("less than"), conn_quant_words): syn_num = HasLessThan(conn_num) moreorless = True for o in finditer(Lemma("atleast"), conn_quant_words): syn_num = HasAtLeast(conn_num) moreorless = True for o in finditer(Lemma("atmost"), conn_quant_words): syn_num = HasAtMost(conn_num) moreorless = True for o in finditer(Lemmas("at least"), conn_quant_words): syn_num = HasAtLeast(conn_num) moreorless = True for o in finditer(Lemmas("at most"), conn_quant_words): syn_num = HasAtMost(conn_num) moreorless = True if not moreorless: syn_num = HasEqualTo(conn_num) #print "syn_num", syn_num # The (sub-)subquery which this subquery is "(pre-/post-)synaptic to". synaptic_to = None if 'synaptic_phrase' in m: p, q = m['synaptic_phrase'] synaptic_phrase = matchwords[p:q] # TODO: Clean this up. spl = [w.lemma.lower() for w in synaptic_phrase] to_idx = spl.index('to') syn_type = set() or_syns = False if 'presynaptic' in spl[:to_idx]: syn_type.add('presynaptic') if 'postsynaptic' in spl[:to_idx]: syn_type.add('postsynaptic') if 'or' in spl[:to_idx]: or_syns = True # NOTE: We currently only support one subquery here, anyway. for n in finditer(self.subquery, synaptic_phrase[to_idx + 1:]): r, s = n.span() synaptic_to, _ = get_subquery(n, synaptic_phrase[to_idx + 1:]) if 'presynaptic' in syn_type: neuron += PresynapticTo(synaptic_to) elif 'postsynaptic' in syn_type: neuron += PostsynapticTo(synaptic_to) if syn_num: neuron += syn_num # This is basically just a trick to update the existing ("parent") subquery. # TODO: Clean this up. for m in finditer(self.subquery, matchwords[:p]): break if 'region_list' in m: p, q = m['region_list'] owned_region = get_region_owner( matchwords[p:q], Predicate(lowercase_is_in(regions))) neuron = neuron + OwnedBy(owned_region) # We identify transmitters and neuron types with the "has" relation (e.g. 
in the SAST) # so to support conjunctions/disjunctions of these modifiers, while also keeping the SAST # "simple" with at most one "has" relation per node, we calculate the "has" relations later has_modifiers = [] if 'neuron_modifiers' in m: p, q = m['neuron_modifiers'] modifiers_words = [x for x in matchwords[p:q] if x.pos != ','] has_modifiers.append( build_mod_tree(modifiers_words, synaptic_to)) if 'transmitters' in m: p, q = m['transmitters'] modifiers_words = [x for x in matchwords[p:q] if x.pos != ','] has_modifiers.append( build_mod_tree(modifiers_words, synaptic_to)) if 'neurons' in m: p, q = m['neurons'] # NOTE: We assume that this can only be "interneuron(s)" or "neuron(s)" if 'interneuron' in ''.join([x.lemma for x in matchwords[p:q]]): has_modifiers.append(IsAttribute() + HasKey('locality') + HasValue('True')) else: # NOTE: For now, we assume that a neuron 'name/type' (and not "neuron") is present # for n in finditer( Pos("CD"), conn_quant_words ): pass if 'expressing_marker' in m: p, q = m['expressing_marker'] expressing_lemmas = [x.lemma for x in matchwords[p:q]] # This is just a temporary solution--before support for genetic markers is added. marker = IsGeneticMarker() + HasName( ' '.join(expressing_lemmas)) neuron = neuron + HasGeneticMarker(marker) # TODO: Include this as a 'has' relation (as above)? if 'conn_quant' in m: # NOTE: This is currently unused by the code generator p, q = m['conn_quant'] conn_quant_words = matchwords[p:q] quantdir = IsNumConnections() # TODO: Perform a search instead of finditer for n in finditer(Pos("CD"), conn_quant_words): r, s = n.span() conn_num = ' '.join( [c.token for c in conn_quant_words[r:s]]) moreorless = False # See above... for o in finditer(Lemmas("more than"), conn_quant_words): quantdir = quantdir + HasMoreThan(conn_num) moreorless = True for o in finditer(Lemmas("less than"), conn_quant_words): quantdir = quantdir + HasLessThan(conn_num) moreorless = True if not moreorless: quantdir = quantdir + HasEqualTo(conn_num) for n in finditer( Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"), conn_quant_words): r, s = n.span() conn_target = ' '.join( [c.token for c in conn_quant_words[r:s]]) # TODO: Make conn_target.lower() ? quantdir = quantdir + HasConnectionsTarget(conn_target) neuron = neuron + HasConnections(quantdir) if 'connections_clause' in m: p, q = m['connections_clause'] connections_words = [ x for x in matchwords[p:q] if x.pos != ',' ] connectives = [] segments = [[]] seg_idx = 0 for word in connections_words: if word.lemma.lower() == 'and': connectives.append('and') segments.append([]) seg_idx += 1 continue if word.lemma.lower() == 'or': connectives.append('or') segments.append([]) seg_idx += 1 continue segments[seg_idx].append(word) # NOTE: We assume that no "segment" list is empty--based on our grammar # Scan from left to right in 'connections_clause' last_conn_type = None region_name = None connection_nodes = [] for segment in segments: # NOTE: The order of these loops (which shouldn't be loops) currently matters. # NOTE: We assume no region has these terms in their name/synonyms. # TODO: Clean this up. 
for n in finditer( Lemma('connection') | Lemma('process') | Lemma('arborization') | Lemma('arborizations') | Lemma('arborize') | Lemma('innervate') | Lemma('innervation'), segment): last_conn_type = 'arbors' break for n in finditer( Lemma('dendrite') | Lemma('input') | Lemma('dendritic'), segment): last_conn_type = 'dendriteArbors' break for n in finditer( Lemma('axon') | Lemma('axonal') | Lemma('axonic') | Lemma('output'), segment): last_conn_type = 'axonArbors' break for n in finditer(Predicate(lowercase_is_in(arbregions)), segment): r, s = n.span() region_name = ' '.join( [comp.token for comp in segment[r:s]]).lower() if region_name not in arbregions: log.error('Unknown region name: ' + region_name) # TODO: Handle gracefully. else: region_name = arbregions[region_name] # NOTE: We assume there's exactly one region per segment # NOTE: We assume last_conn_type was set at least initially--based on our grammar conn_region = HasType(last_conn_type) conn_region += HasRegion(region_name) connection_nodes.append(conn_region) # NOTE: We assume there is at least one element in connection_nodes # NOTE: We assume the number of connectives is one less than len(connection_nodes) if len(connection_nodes) == 1: has_modifiers.append(connection_nodes[0]) else: connective = None if connectives.pop(0) == 'and': connective = IsAndOp() + HasPart(connection_nodes.pop(0)) \ + HasPart(connection_nodes.pop(0)) else: connective = IsOrOp() + HasPart(connection_nodes.pop(0)) \ + HasPart(connection_nodes.pop(0)) while len(connectives) > 0: if connectives.pop(0) == 'and': connective = IsAndOp() + HasPart(connective) \ + HasPart(connection_nodes.pop(0)) else: connective = IsOrOp() + HasPart(connective) \ + HasPart(connection_nodes.pop(0)) has_modifiers.append(connective) if 'is_connecting' in m: p, q = m['is_connecting'] region_pair = [] for n in finditer(Predicate(lowercase_is_in(arbregions)), matchwords[p:q]): r, s = n.span() r, s = r + p, s + p # Get the offset from matchwords region_name = ' '.join( [comp.token for comp in matchwords[r:s]]).lower() if region_name not in arbregions: log.error('Unknown region name: ' + region_name) # TODO: Handle gracefully else: region_pair.append(arbregions[region_name]) # Check that there were exactly two regions. NOTE: This could be enforced by the grammar. if len(region_pair) == 2: # NOTE: We assume the first region we parse is the "from" region. if 'and' in [x.lemma for x in matchwords[p:q]]: conn1 = IsConnection() + FromRegion( region_pair[0]) + ToRegion(region_pair[1]) conn2 = IsConnection() + FromRegion( region_pair[1]) + ToRegion(region_pair[0]) connecting_node = IsOrOp() + HasPart(conn1) + HasPart( conn2) else: # NOTE: We assume 'to' is present connecting_node = IsConnection() + FromRegion( region_pair[0]) + ToRegion(region_pair[1]) # NOTE: At least for now, we'll put connections in with the 'has' relations # but NOTE that this only works if codegen_optimization to true. # neuron += Connecting( connecting_node ) has_modifiers.append(connecting_node) # Now create a single "has node" for this subquery's neuron. if len(has_modifiers) > 1: # NOTE: We assume all 'has' objects are "conjuncted together". has_node = IsAndOp() for mod in has_modifiers: has_node += HasPart(mod) neuron += Has(has_node) elif len(has_modifiers) == 1: neuron += Has(has_modifiers[0]) return neuron, owned_region subquery_list.append(get_subquery(mtch, match.words)) # We could attach the prep. phrases (e.g. "in [regions]") to previous subqueries # only if they don't already have their own prep. phrase. 
""" subquery_list = subquery_list[::-1] prev_ownedby = subquery_list[0][1] for i, (subq, ownedby) in enumerate( subquery_list ): if prev_ownedby is not None: if ownedby is None: # TODO: Check that Python is okay with these 'is's and 'not's. subquery_list[i][0] += OwnedBy( prev_ownedby ) else: prev_ownedby = ownedby """ if len(subquery_list) == 1: final_query = subquery_list[0][0] else: # NOTE: We currently assume set union across subqueries final_query = IsOrOp() # NOTE: If prep. phrase attaching, ownedby data should be considered stale at this point; # queries themselves would have been updated with "owned_by" relation info. for subq, ownedby in subquery_list: final_query += HasPart(subq) formatting = None # NOTE: We parse queries with an opener for each subquery, but currently only use the last if getattr(match, 'opener', None): form_lems = match.opener.lemmas if 'add' in form_lems: final_query += HasVerb('add') elif 'remove' in form_lems: final_query += HasVerb('remove') elif 'keep' in form_lems or 'retain' in form_lems: final_query += HasVerb('keep') elif 'list' in form_lems: formatting = 'information' elif 'graph' in form_lems: formatting = 'network' elif 'unpin' in form_lems: final_query += HasVerb('unpin') elif 'pin' in form_lems: final_query += HasVerb('pin') elif 'uncolor' in form_lems: final_query += HasVerb('uncolor') elif 'color' in form_lems: final_query += HasVerb('color') # NOTE: We only check for colors if 'color' is the verb if getattr(match, 'color', None): hue = match.color.lemmas if hue in colors_values: hue = colors_values[hue] else: # It's hex for a color if hue.startswith('#'): hue = hue[1:] # NOTE: We assume right-most bit of given hex is LSB hue = '0' * (6 - len(hue)) + hue final_query += HasColor(hue) # Not exactly natural language... elif 'unanimate' in form_lems or 'unblink' in form_lems: final_query += HasVerb('unblink') elif 'animate' in form_lems or 'blink' in form_lems: final_query += HasVerb('blink') elif 'unhide' in form_lems: final_query += HasVerb('unhide') elif 'hide' in form_lems: final_query += HasVerb('hide') if 'reveal' in form_lems: final_query += HasVerb('reveal') # NOTE: "format" group overrides any "opener" group--for formatting # e.g. "List neurons in Lamina as morphology" will use morphology formatting. # TODO: What about 'show gabaergic neurons as? [color]' or 'as? [blinking]' ? if getattr(match, 'formatting', None): form_lems = match.formatting.lemmas if 'list' in form_lems or 'information' in form_lems: formatting = 'information' elif 'network' in form_lems: formatting = 'network' elif 'morphology' in form_lems: formatting = 'morphology' if formatting: final_query += HasFormat(formatting) return final_query, "enum"
def main(): get_total = count_total_corpus() count = 0 f_name = str(count+1) uni_collection = [] bi_collection = [] tri_collection = [] four_collection = [] while (count < get_total): n_files = str(count+1) get_doc = open('traindata/doc'+n_files+'.txt', 'r') raw_doc = get_doc.read() ##Extract title## title = get_title(raw_doc) ##Extract First&Last Sentence## fir_sen = get_first_sen(raw_doc) last_sen = get_last_sen(raw_doc) get_last = last_sen.split(',') get_length = len(get_last) #### KEYWORD SECTION #### x=0 key_unigram = '' key_bigram = '' key_trigram = '' key_fourgram = '' key_unknown = '' while (x<get_length): get_len = len(get_last[x].split()) if (get_len == 1): key_unigram += get_last[x]+',' elif (get_len == 2): key_bigram += get_last[x]+',' elif (get_len == 3): key_trigram += get_last[x]+',' elif (get_len == 4): key_fourgram += get_last[x]+',' else: key_unknown += get_last[x]+',' x += 1 ### GET IN LIST ### key_unis = key_unigram.split(',') key_bis = key_bigram.split(',') key_tris = key_trigram.split(',') key_fours = key_fourgram.split(',') key_uns = key_unknown.split(',') ##print key_unis, key_bis, key_tris, key_fours, key_uns get_content = raw_doc.splitlines()[1:] #List form after_last_sen = get_content[:-1] content_str = ''.join(after_last_sen) #content in String format prettify_txt = re.sub(r'[^\w.]',' ', content_str) ##mod_txt = remov_stopword(prettify_txt) token_txt = nltk.sent_tokenize(prettify_txt) ##Number of Sentence: len(token_txt)## token_word = [nltk.word_tokenize(sent) for sent in token_txt] pos_tag = [nltk.pos_tag(sent) for sent in token_word] ##Chunking and printing NP## get_nouns = [[Word(*x) for x in sent] for sent in pos_tag] ##NNP Rules## rule_0 = W(pos = "NNS")| W(pos = "NNS")| W(pos = "NN") | W(pos = "NNP") rule_05 = W(pos = "NNP") + W(pos = "NNS") rule_1 = W(pos = "WP$") + W(pos = "NNS") rule_2 = W(pos = "CD") + W(pos = "NNS") rule_3 = W(pos = "NN") + W(pos = "NN") rule_4 = W(pos = "NN") + W(pos = "NNS") rule_5 = W(pos = "NNP") + W(pos = "CD") rule_6 = W(pos = "NNP") + W(pos = "NNP") rule_7 = W(pos = "NNP") + W(pos = "NNPS") rule_8 = W(pos = "NNP") + W(pos = "NN") rule_9 = W(pos = "NNP") + W(pos = "VBZ") rule_10 = W(pos = "DT") + W(pos = "NNS") rule_11 = W(pos = "DT") + W(pos = "NN") rule_12 = W(pos = "DT") + W(pos = "NNP") rule_13 = W(pos = "JJ") + W(pos = "NN") rule_14 = W(pos = "JJ") + W(pos = "NNS") rule_15 = W(pos = "PRP$") + W(pos = "NNS") rule_16 = W(pos = "PRP$") + W(pos = "NN") rule_02 = W(pos = "NN") + W(pos = "NN") + W(pos = "NN") rule_17 = W(pos = "NN") + W(pos = "NNS") + W(pos = "NN") rule_18 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP") rule_19 = W(pos = "JJ") + W(pos = "NN") + W(pos = "NNS") rule_20 = W(pos = "PRP$") + W(pos = "NN") + W(pos = "NN") rule_21 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN") rule_22 = W(pos = "DT") + W(pos = "CD") + W(pos = "NNS") rule_23 = W(pos = "DT") + W(pos = "VBG") + W(pos = "NN") rule_24 = W(pos = "DT") + W(pos = "NN") + W(pos = "NN") rule_25 = W(pos = "NNP") + W(pos = "NNP") + W(pos = "VBZ") rule_26 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN") rule_27 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP") rule_28 = W(pos = "DT") + W(pos = "JJ") + W(pos = "NN") rule_29 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NNP") + W(pos = "NNP") rule_30 = W(pos = "DT") + W(pos = "NNP") + W(pos = "NN") + W(pos = "NN") NP_bi_gram_set = (rule_05)|(rule_1)|(rule_2)|(rule_3)|(rule_4)|(rule_5)|(rule_6)|(rule_7)|(rule_8)|(rule_9)|(rule_10)|(rule_11)|(rule_12)|(rule_13)|(rule_14)|(rule_15)|(rule_16) NP_tri_gram_set = 
        NP_quard_gram_set = (rule_29)|(rule_30)

        #Rule set function
        get_uni_gram = (rule_0)
        get_bi_gram = NP_bi_gram_set
        get_tri_gram = NP_tri_gram_set
        get_quard_gram = NP_quard_gram_set

        bag_of_NP = []
        bag_of_biNP = []
        bag_of_triNP = []
        bag_of_fourNP = []
        total__tfidf = 0
        #######################
        for k, s in enumerate(get_nouns):
            for match in finditer(get_uni_gram, s):
                x, y = match.span()     #the match spans x to y inside the sentence s
                ##print pos_tag[k][x:y]
                bag_of_NP += pos_tag[k][x:y]
        for k, s in enumerate(get_nouns):
            for match in finditer(get_bi_gram, s):
                x, y = match.span()
                ##print pos_tag[k][x:y]
                bag_of_biNP += pos_tag[k][x:y]
        for k, s in enumerate(get_nouns):
            for match in finditer(get_tri_gram, s):
                x, y = match.span()
                ##print pos_tag[k][x:y]
                bag_of_triNP += pos_tag[k][x:y]
        for k, s in enumerate(get_nouns):
            for match in finditer(get_quard_gram, s):
                x, y = match.span()
                ##print pos_tag[k][x:y]
                bag_of_fourNP += pos_tag[k][x:y]

        ##### GETTING EACH WORD TFIDF #####
        uni_tfidf_values = ''
        str_uni_grams = ''
        total_docs = count_total_corpus()
        fdist = nltk.FreqDist(bag_of_NP)
        unzip_unigram = zip(*bag_of_NP)
        str_unigrams = list(unzip_unigram[0])

        ##UNI MAXIMUM TermScore##
        scores = []
        for word in fdist:
            score = fdist[word]
            scores.append(score)
        max_uni = max(scores)
        ######################
        for word in fdist:
            fq_word = fdist[word]
            get_tf = term_frequency(fq_word, max_uni)
            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)
            num_of_doc_word = count_nterm_doc(get_this_string)
            idf_score = inverse_df(total_docs, num_of_doc_word)
            tf_idf_scr = get_tf * idf_score
            total__tfidf += tf_idf_scr
            uni_tfidf_scr = repr(tf_idf_scr)+' '
            uni_tfidf_values += uni_tfidf_scr
            str_uni_grams += get_this_string+','

        get_uni_float = [float(x) for x in uni_tfidf_values.split()]
        get_uni_list = str_uni_grams.split(',')
        unigram_dict = dict(zip(get_uni_list, get_uni_float))

        ##### GET TFIDF FOR UNIGRAMS & AVERAGE TFIDF VALUES #####
        uni_avg_tfidf = (sum(map(float, get_uni_float)))/(len(get_uni_float))
        get_zip_str = [''.join(item) for item in str_unigrams]
        unigrams_list = zip(get_zip_str, get_uni_float)

        ##### TFIDF FEATURE MATRIX #####
        uni_feat_tfidf = []
        for x in unigrams_list:
            if float(x[1]) > uni_avg_tfidf:
                uni_feat_tfidf.append(1)
            else:
                uni_feat_tfidf.append(0)
        zip_tfidf_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf)
        ###############################

        ##### First Sentence Feat #####
        uni_fir_sen = []
        for x in unigrams_list:
            file_name = 'traindata/doc'+f_name+'.txt'
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                uni_fir_sen.append(1)
            else:
                uni_fir_sen.append(0)
        zip_fir_sen_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen)
        ############################

        ##### Involve in Title #####
        uni_title_feat = []
        for x in unigrams_list:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                uni_title_feat.append(1)
            else:
                uni_title_feat.append(0)
        zip_uni_feats = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen, uni_title_feat)
        ############################

        ##### KEYWORD OR NOT #####
        key_uni_matx = []
        for x in unigrams_list:
            get_res = chk_keyword(x[0], key_unis)
            if get_res == 1:
                key_uni_matx.append(1)
            else:
                key_uni_matx.append(0)
        zip_uni_all_feat = zip(get_zip_str, get_uni_float, uni_feat_tfidf, uni_fir_sen, uni_title_feat, key_uni_matx)
        #########################################################

        ##### GETTING BIGRAMS #####
        ##Term Frequency for bigrams##
        total__tfidf = 0
        bi_tfidf_values = ''
        str_bi_grams = ''
        unzip_bigram = zip(*bag_of_biNP)
        str_bigrams = list(unzip_bigram[0])
        get_bigrams = zip(str_bigrams, str_bigrams[1:])[::2]
        bi_dist = nltk.FreqDist(bag_of_biNP)

        ##BI MAXIMUM TermScore##
        bi_scores = []
        for word in bi_dist:
            score = bi_dist[word]
            bi_scores.append(score)
        max_bi = max(bi_scores)
        ######################
        for word in bi_dist:
            tq_word = bi_dist[word]
            get_tf = term_frequency(tq_word, max_bi)
            ### FEATURES ###
            ##Tuple to String##
            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)
            ##DF Score
            num_of_doc_word = count_nterm_doc(get_this_string)
            ##TF.IDF Score
            idf_score = inverse_df(total_docs, num_of_doc_word)
            tf_idf_scr = get_tf*idf_score
            total__tfidf += tf_idf_scr
            ##GET EACH BIGRAMS TFIDF
            get_tfidf_scr = repr(tf_idf_scr)+' '
            bi_tfidf_values += get_tfidf_scr
            str_bi_grams += get_this_string+','

        ##BUILD DICT FOR EACH TERMS
        get_float = [float(x) for x in bi_tfidf_values.split()]
        get_bi_list = str_bi_grams.split(',')
        bigram_dict = dict(zip(get_bi_list, get_float))
        ###########################

        ##GET TFIDF FOR BIGRAMS##
        get_bi_floats = get_val_bipairs(bigram_dict, get_bigrams)
        get_zip = dict(zip(get_bigrams, get_bi_floats))
        ############
        real_avg_tfidf = (sum(map(float, get_bi_floats)))/(len(get_bi_floats))
        ###########################
        get_zip_str = [' '.join(item) for item in get_bigrams]

        ###Bigrams string with TFIDF###
        bigrams_list = zip(get_zip_str, get_bi_floats)

        ##### TFIDF FEATURE MATRIX #####
        feat_tfidf_matx = []
        for x in bigrams_list:
            if float(x[1]) > real_avg_tfidf:
                feat_tfidf_matx.append(1)
            else:
                feat_tfidf_matx.append(0)
        tfidf_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx)
        #################################

        #### FIRST SENTENCE FEATURE ####
        feat_fir_sen = []
        for x in tfidf_feat:
            file_name = 'traindata/doc'+f_name+'.txt'
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                feat_fir_sen.append(1)
            else:
                feat_fir_sen.append(0)
        fir_sen_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen)

        #### INVOLVE IN TITLE FEATURE ###
        feat_invol_tit = []
        for x in fir_sen_feat:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                feat_invol_tit.append(1)
            else:
                feat_invol_tit.append(0)
        invol_tit_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen, feat_invol_tit)

        ##### KEYWORD OR NOT #####
        key_bi_matx = []
        for x in bigrams_list:
            get_res = chk_keyword(x[0], key_bis)
            if get_res == 1:
                key_bi_matx.append(1)
            else:
                key_bi_matx.append(0)
        zip_bi_all_feat = zip(get_zip_str, get_bi_floats, feat_tfidf_matx, feat_fir_sen, feat_invol_tit, key_bi_matx)
        #####################################

        ##### GETTING TRIGRAMS #####
        #Term Frequency for trigrams
        total__tfidf = 0
        tri_tfidf_values = ''
        str_tri_grams = ''
        unzip_trigram = zip(*bag_of_triNP)
        str_trigrams = list(unzip_trigram[0])
        get_trigrams = zip(str_trigrams, str_trigrams[1:], str_trigrams[2:])[::3]
        tri_dist = nltk.FreqDist(bag_of_triNP)

        ##TRI MAXIMUM TermScore##
        tri_scores = []
        for word in tri_dist:
            score = tri_dist[word]
            tri_scores.append(score)
        max_tri = max(tri_scores)
        ######################
        for word in tri_dist:
            tr_fq = tri_dist[word]
            get_tf = term_frequency(tr_fq, max_tri)
            ### FEATURES ###
            ##Tuple to String##
            to_string = ':'.join(word)
            get_this_string = convert_to_string(to_string)
            ##DF Score
            num_of_doc_word = count_nterm_doc(get_this_string)
            ##TF.IDF Score
            idf_score = inverse_df(total_docs, num_of_doc_word)
            tf_idf_scr = get_tf * idf_score
            total__tfidf += tf_idf_scr
            ##GET EACH TRIGRAMS TFIDF
            get_tfidf_scr = repr(tf_idf_scr)+' '
            tri_tfidf_values += get_tfidf_scr
            str_tri_grams += get_this_string+','

        ##BUILD DICT FOR EACH TERMS
        get_tri_float = [float(x) for x in tri_tfidf_values.split()]
        get_tri_list = str_tri_grams.split(',')
        trigram_dict = dict(zip(get_tri_list, get_tri_float))
        ###########################

        ##GET TFIDF FOR TRIGRAMS##
        get_tri_floats = get_val_tripairs(trigram_dict, get_trigrams)
        get_tri_zip = dict(zip(get_trigrams, get_tri_floats))
        ############
        tri_avg_tfidf = (sum(map(float, get_tri_floats)))/(len(get_tri_floats))
        ###########################
        get_ziptri_str = [' '.join(item) for item in get_trigrams]

        ###Trigrams string with TFIDF###
        trigrams_list = zip(get_ziptri_str, get_tri_floats)
        ###########################

        ##### TFIDF FEATURE MATRIX #####
        tri_tfidf_matx = []
        for x in trigrams_list:
            if float(x[1]) > tri_avg_tfidf:
                tri_tfidf_matx.append(1)
            else:
                tri_tfidf_matx.append(0)
        tri_tfidf_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx)
        ################################

        #### FIRST SENTENCE FEATURE ####
        tri_fir_sen = []
        for x in tri_tfidf_feat:
            file_name = 'traindata/doc'+f_name+'.txt'
            get_res = chk_frs_sen(x[0], file_name)
            if get_res == 1:
                tri_fir_sen.append(1)
            else:
                tri_fir_sen.append(0)
        tri_sen_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen)
        #################################

        #### INVOLVE IN TITLE FEATURE ###
        tri_invol_tit = []
        for x in tri_sen_feat:
            get_res = involve_in_title(x[0], title)
            if get_res == 1:
                tri_invol_tit.append(1)
            else:
                tri_invol_tit.append(0)
        tri_tit_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen, tri_invol_tit)
        ##################################################

        ##### KEYWORD OR NOT #####
        key_tri_matx = []
        for x in trigrams_list:
            get_res = chk_keyword(x[0], key_tris)
            if get_res == 1:
                key_tri_matx.append(1)
            else:
                key_tri_matx.append(0)
        # Use the per-trigram scores (get_tri_floats) so every column stays aligned with get_ziptri_str.
        zip_tri_all_feat = zip(get_ziptri_str, get_tri_floats, tri_tfidf_matx, tri_fir_sen, tri_invol_tit, key_tri_matx)
        #########################################################

        ##### GETTING 4-GRAMS #####
        #Term Frequency for 4-grams
        if (len(bag_of_fourNP) > 0):
            total__tfidf = 0
            four_tfidf_values = ''
            str_four_grams = ''
            ###############
            unzip_fourgram = zip(*bag_of_fourNP)
            str_fourgrams = list(unzip_fourgram[0])
            get_fourgrams = zip(str_fourgrams, str_fourgrams[1:], str_fourgrams[2:], str_fourgrams[3:])[::4]
            ############################
            f_dist = nltk.FreqDist(bag_of_fourNP)

            ##4 MAXIMUM TermScore##
            four_scores = []
            for word in f_dist:
                score = f_dist[word]
                four_scores.append(score)
            max_four = max(four_scores)
            ######################
            for word in f_dist:
                fr_fq = f_dist[word]
                get_tf = term_frequency(fr_fq, max_four)
                ### FEATURES ###
                ##Tuple to String##
                to_string = ':'.join(word)
                get_this_string = convert_to_string(to_string)
                ##DF Score
                num_of_doc_word = count_nterm_doc(get_this_string)
                ##TF.IDF Score
                idf_score = inverse_df(total_docs, num_of_doc_word)
                tf_idf_scr = get_tf * idf_score
                total__tfidf += tf_idf_scr
                ##GET EACH FOURGRAMS TFIDF
                get_tfidf_scr = repr(tf_idf_scr)+' '
                four_tfidf_values += get_tfidf_scr
                str_four_grams += get_this_string+','

            ##BUILD DICT FOR EACH TERMS
            get_four_float = [float(x) for x in four_tfidf_values.split()]
            get_four_list = str_four_grams.split(',')
            fourgram_dict = dict(zip(get_four_list, get_four_float))
            ###########################

            ##GET TFIDF FOR 4-GRAMS##
            get_four_floats = get_val_fpairs(fourgram_dict, get_fourgrams)
            get_four_zip = dict(zip(get_fourgrams, get_four_floats))
            ############
            four_avg_tfidf = (sum(map(float, get_four_floats)))/(len(get_four_floats))
            ###########################
            get_zipfour_str = [' '.join(item) for item in get_fourgrams]
            ###Fourgrams string with TFIDF###
            fourgrams_list = zip(get_zipfour_str, get_four_floats)
            ###########################

            ##### TFIDF FEATURE MATRIX #####
            four_tfidf_matx = []
            for x in fourgrams_list:
                if float(x[1]) > four_avg_tfidf:
                    four_tfidf_matx.append(1)
                else:
                    four_tfidf_matx.append(0)
            four_tfidf_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx)
            #################################

            #### FIRST SENTENCE FEATURE ####
            four_fir_sen = []
            for x in four_tfidf_feat:
                file_name = 'traindata/doc'+f_name+'.txt'
                get_res = chk_frs_sen(x[0], file_name)
                if get_res == 1:
                    four_fir_sen.append(1)
                else:
                    four_fir_sen.append(0)
            four_sen_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen)
            #################################

            #### INVOLVE IN TITLE FEATURE ###
            four_invol_tit = []
            # Iterate over the 4-gram features (not the trigram ones) so the rows line up.
            for x in four_sen_feat:
                get_res = involve_in_title(x[0], title)
                if get_res == 1:
                    four_invol_tit.append(1)
                else:
                    four_invol_tit.append(0)
            four_tit_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen, four_invol_tit)

            ##### KEYWORD OR NOT #####
            key_four_matx = []
            for x in fourgrams_list:
                get_res = chk_keyword(x[0], key_fours)
                if get_res == 1:
                    key_four_matx.append(1)
                else:
                    key_four_matx.append(0)
            zip_four_all_feat = zip(get_zipfour_str, get_four_floats, four_tfidf_matx, four_fir_sen, four_invol_tit, key_four_matx)
            #########################################################
        else:
            print 'Pass4-gram'
            zip_four_all_feat = ''

        uni_collection += zip_uni_all_feat
        bi_collection += zip_bi_all_feat
        tri_collection += zip_tri_all_feat
        four_collection += zip_four_all_feat
        total_unigram = len(uni_collection)     ##UNIGRAM
        total_bigram = len(bi_collection)       ##BIGRAM
        total_trigram = len(tri_collection)     ##TRIGRAM
        total_fourgram = len(four_collection)   ##FOURGRAM
        #######################
        print "Document "+n_files+" has been processed."
        count += 1
    ############################################
    get_uni_vals = cal_bayes(uni_collection)
    get_bi_vals = cal_bayes(bi_collection)
    get_tri_vals = cal_bayes(tri_collection)
    get_four_vals = cal_bayes(four_collection)

    ##### GET TFIDF DISTRIBUTIONS #####
    print '########## TFIDF DISTRIBUTIONS FOR N-GRAMS ##########'
    print dist_tfidf(get_uni_vals)
    print dist_tfidf(get_bi_vals)
    print dist_tfidf(get_tri_vals)
    print dist_tfidf(get_four_vals)
    ###################################

    ##### GET FIRST SENTENCE DISTRIBUTIONS #####
    print '########## FIRST SEN. DISTRIBUTIONS FOR N-GRAMS ##########'
    print dist_firsen(get_uni_vals)
    print dist_firsen(get_bi_vals)
    print dist_firsen(get_tri_vals)
    print dist_firsen(get_four_vals)
    ############################################

    ##### GET TITLE DISTRIBUTIONS #####
    print '########## TITLE DISTRIBUTIONS FOR N-GRAMS ##########'
    print dist_title(get_uni_vals)
    print dist_title(get_bi_vals)
    print dist_title(get_tri_vals)
    print dist_title(get_four_vals)
    ###################################

    ##### PRODUCE TEXT #####
    print '########## STORE INTO TEXT ##########'
    matrix_txt('uni_tf.txt', dist_tfidf(get_uni_vals))
    matrix_txt('uni_fs.txt', dist_firsen(get_uni_vals))
    matrix_txt('uni_tit.txt', dist_title(get_uni_vals))
    matrix_txt('bi_tf.txt', dist_tfidf(get_bi_vals))
    matrix_txt('bi_fs.txt', dist_firsen(get_bi_vals))
    matrix_txt('bi_tit.txt', dist_title(get_bi_vals))
    matrix_txt('tri_tf.txt', dist_tfidf(get_tri_vals))
    matrix_txt('tri_fs.txt', dist_firsen(get_tri_vals))
    matrix_txt('tri_tit.txt', dist_title(get_tri_vals))
    matrix_txt('four_tf.txt', dist_tfidf(get_four_vals))
    matrix_txt('four_fs.txt', dist_firsen(get_four_vals))
    matrix_txt('four_tit.txt', dist_title(get_four_vals))
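main() relies on several helpers (term_frequency, inverse_df, count_total_corpus, and others) that are defined elsewhere and not shown in this listing. Below is a minimal sketch of the two scoring helpers, assuming the usual max-normalised term frequency and log-scaled inverse document frequency (the original implementations may differ), followed by the standard script entry point.

import math

# Assumed implementations -- the originals are not part of this listing.
def term_frequency(freq, max_freq):
    # Term count normalised by the count of the most frequent term in the document.
    return float(freq) / max_freq

def inverse_df(total_docs, num_docs_with_term):
    # Log-scaled inverse document frequency; the +1 avoids division by zero
    # for terms the document-frequency lookup does not find.
    return math.log(float(total_docs) / (1 + num_docs_with_term))

if __name__ == '__main__':
    main()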