def nbc(query_file, result_file, dic, predict_fun):
    """Classify each document in *query_file* and write labels to *result_file*.

    The input's header line is copied through unchanged. For every
    following line the document text is parsed, classified with
    *predict_fun*, and the predicted label replaces an existing trailing
    "0"/"1" label or is appended as a new tab-separated column.

    Args:
        query_file: path to the tab-separated query file (header + docs).
        result_file: path the labelled output is written to.
        dic: trained term/label dictionary (see train()).
        predict_fun: scoring function, e.g. ``bernoulli``.
    """
    print("-"*60)
    print("Start Classification...")
    print("Classification using", predict_fun.__name__)
    if predict_fun == bernoulli:
        # Bernoulli NB needs the log-probability mass of *absent* terms;
        # pre-compute the full-vocabulary sum once instead of per document.
        bernoulli.tmp_result = {"0": 0, "1": 0}
        for t in dic.once["term"]:
            tmp = bernoulli.tmp_result
            tmp["0"] += math.log(1 - smoothing_bernoulli(dic, t, "0"))
            tmp["1"] += math.log(1 - smoothing_bernoulli(dic, t, "1"))
    # Context managers close both files even if classification raises
    # part-way through; the original leaked both handles on error.
    with open(query_file, "rt") as in_f, open(result_file, "wt") as out_f:
        out_f.write(in_f.readline())  # copy header line through unchanged
        for line in in_f:
            stripped = line.strip()
            id_, sentence, label = training.split_doc(stripped)
            term_list = nlp.parser(sentence)
            new_label = classify(term_list, dic, predict_fun)
            if stripped and stripped[-1] in ("0", "1"):
                # Overwrite an existing label in the last column.
                # (The empty-line guard avoids the original's IndexError.)
                new_line = stripped[:-1] + new_label + "\n"
            else:
                # No label yet: append one as a new column.
                new_line = stripped + "\t" + new_label + "\n"
            out_f.write(new_line)
    print("Finish Classification!")
def getStrength(nlp, line):
    """Compute an aggregated sentiment strength score for *line*.

    Returns 0 for None/blank input; otherwise runs the NLP pipeline,
    extracts opinion phrases per sentence and aggregates their
    strengths via statistics().
    """
    # Guard BEFORE touching the value: the original called
    # line.strip() first and then tested ``is None``, which could never
    # be true (str.strip() never returns None) and would already have
    # raised AttributeError on None input.
    if line is None:
        return 0
    line = line.strip()
    if not line:
        # Nothing to analyse in a blank line.
        return 0
    line = line.encode("utf-8")
    loadSENTI(os.path.join(pwd, 'sentiment2.txt'))
    # Load the nonlinear sentiment phrase strengths.
    nonLINEAR = loadLEXICON(os.path.join(pwd, 'nonlinear.txt'))
    # Natural language processing: POS tagging + parsing per sentence.
    posed, parsed = parser(nlp, line)
    # Find opinion phrases and compute the sentiment strength.
    seqs = []
    for pos, parse in zip(posed, parsed):
        phrases = findPHRASE(pos, parse)
        finalPH = filterPHRASE(phrases)
        phraseNUMBERseqs = calALL(nonLINEAR, os.path.join(pwd, 'advxxx.txt'), finalPH)
        seqs.append(phraseNUMBERseqs)
    senti = statistics("|".join(seqs))
    return senti
def get_keywords(querylist, delims, input_path, output_path, crawled_date):
    """Extract the top keywords from crawled result titles and dump them.

    Loads the first JSON result file matching *querylist*/*crawled_date*
    under *input_path*, tokenizes the concatenated titles, removes
    stopwords, counts keyword frequencies, writes them as JSON lines to
    *output_path*, and returns the keyword list.
    """
    # TODO: prevent Korean text from being parsed
    # TODO: unicode support
    # TODO: check collocations
    queries = '_'.join(querylist)
    inp = input_filename(queries, crawled_date)
    filenames = get_filenames(input_path, inp)
    pprint(filenames)

    # TODO: why only search _results[0]?
    with open(filenames[0], 'r') as f:
        docs = json.load(f)

    # Join all titles in one pass; the original's repeated
    # ``text = text + d['title']`` is quadratic in the document count.
    # TODO: printing d['title'] showed the same entry repeatedly -- possible bug upstream
    text = ''.join(d['title'] for d in docs)

    parsed = parser(text, delims)
    sanitized = [sanitizer(p) for p in parsed]
    removed = rm_stopwords(sanitized)
    # list() keeps ``filtered`` a reusable sequence under Python 3,
    # where filter() returns a one-shot iterator.
    filtered = list(filter(None, removed))
    keywords = word_counter(filtered, config.NKEYWORDS)
    pprint(keywords[0:config.NPRINTWORDS])

    outp = output_filename(queries)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # NOTE(review): plain string concatenation preserved -- callers
    # appear to pass output_path with a trailing separator; confirm
    # before switching to os.path.join.
    with open(output_path + outp, 'w') as f:
        for k in keywords:
            json.dump(k, f)
            f.write(',\n')

    # TODO: attach POS tags to the keyword results
    return keywords
def sentiFLY1(line):
    """Compute a sentiment polarity statistic for *line*.

    Loads the sentiment lexicons, runs the segmentation / POS-tagging /
    parsing pipeline, extracts opinion phrases and aggregates their
    strengths.  (Python 2 code: uses print statements.)
    """
    print 'current directory is ',os.getcwd()
    loadSENTI('./sentiment2.txt')  # load the base sentiment lexicon
    ## nlp
    print "segment begins:"
    seged = seg(line)       # word segmentation
    print "pos tagger begins:"
    posed = pos(seged)      # part-of-speech tagging
    print "parser begins:"
    parsed = parser(seged)  # syntactic parse of the segmented line
    print "parser is over."
    print "seged",seged
    print "parsed",parsed
    ## find phrases and compute the sentiment polarity
    phrases = findPHRASE(posed,parsed)
    print ' '.join(phrases)
    finalPH = filterPHRASE(phrases)
    # Nonlinear phrase-strength lexicon, loaded just before scoring.
    nonLINEAR = loadLEXICON('./nonlinear.txt')
    phraseNUMBERseqs = calALL(nonLINEAR,'./advxxx.txt',finalPH)
    return statistics(phraseNUMBERseqs)
def sentiFLY1(line): print 'current directory is ', os.getcwd() loadSENTI('./sentiment2.txt') ## nlp print "segment begins:" seged = seg(line) print "pos tagger begins:" posed = pos(seged) print "parser begins:" parsed = parser(seged) print "parser is over." print "seged", seged print "parsed", parsed ## find phrases and compute the sentiment polarity phrases = findPHRASE(posed, parsed) print ' '.join(phrases) finalPH = filterPHRASE(phrases) nonLINEAR = loadLEXICON('./nonlinear.txt') phraseNUMBERseqs = calALL(nonLINEAR, './advxxx.txt', finalPH) return statistics(phraseNUMBERseqs)
def train(file_name):
    """Build and return a term/label Dictionary from the corpus in *file_name*.

    The file is expected to have a header row followed by one
    tab-separated document per line (id, sentence, label).  Prints
    timing and corpus statistics when done.
    """
    print("-"*60)
    print("Start training...")
    start_time = timeit.default_timer()
    dic = dictionary.Dictionary()
    with open(file_name) as f:
        next(f)  # skip the header row
        for line in f:
            id_, sentence, label = split_doc(line.strip())
            dic.increment_label(label)
            seen = set()
            for term in nlp.parser(sentence):
                # "repeat" counts accumulate for every occurrence;
                # "once" counts are bumped only the first time a term
                # appears within this document.
                dic.add_term_repeat(term, label)
                if term not in seen:
                    dic.add_term_once(term, label)
                    seen.add(term)
    elapsed = timeit.default_timer() - start_time
    print("End training!")
    print("Training of '{}'".format(file_name))
    print("Training time: {:.3} sec".format(elapsed))
    print("Total {} docs".format(dic.total_doc()))
    print("Good doc:", dic.doc_frequency("1"))
    print("Bad doc:", dic.doc_frequency("0"))
    print("Number of terms:", len(dic.once["term"]))
    return dic
def createEvent():
    """Create an Event from the submitted form and persist it.

    On database failure the transaction is rolled back and the index
    page is rendered with an error message; on success the client is
    redirected to the index route.
    """
    eid = "%.3f" % (time.time())
    host = request.form['host']
    message = request.form['message']
    event_time = request.form['time']
    location = request.form['location']
    (tokens, actions_list) = parser(message)
    if len(actions_list) == 0:
        # Parser found no actionable phrase: fall back to the raw message.
        actions_list.append(message)
    event_created = Event(eid=eid, host=host, message=actions_list[0],
                          time=event_time, location=location)
    try:
        db.session.add(event_created)
        db.session.commit()
    except Exception:
        # Narrowed from a bare ``except:`` (which also swallowed
        # KeyboardInterrupt/SystemExit).  Roll back so the session
        # stays usable after the failed commit.  The original's
        # leftover gibberish debug print was removed.
        db.session.rollback()
        err_msg = "Create Event Failed!"
        context = dict(data=err_msg)
        return render_template("index.html", **context)
    print("Create Event Succeeded!")
    return redirect(url_for('index'))
def createEvent():
    """Create an Event, geocoding its location via the Google Maps API.

    Geocoding is best-effort: on no results, network failure or
    timeout the event falls back to default coordinates.  On database
    failure the transaction is rolled back and the index page is
    rendered with an error; on success the client is redirected.
    """
    eid = "%.3f" % (time.time())
    host = request.form['host']
    message = request.form['message']
    event_time = request.form['time']
    location = request.form['location']
    # SECURITY: API key hardcoded in source -- move it to an
    # environment variable / config file and rotate the key.
    key = "&key=AIzaSyAOd0yLa2PRCmldYBjXYLF5eZXhzaCv8jE"
    # Default coordinates used when geocoding yields nothing or fails.
    lat = "40.8075355"
    lon = "-73.9625727"
    try:
        res = requests.get(
            'https://maps.googleapis.com/maps/api/geocode/json?address='
            + location.replace(" ", "+") + key,
            timeout=10)  # don't hang the request thread forever
        data = res.json()
        if data.get("results"):
            lat = data["results"][0]["geometry"]["location"]["lat"]
            lon = data["results"][0]["geometry"]["location"]["lng"]
    except requests.RequestException:
        # Geocoding is non-essential; keep the default coordinates
        # instead of failing the whole request (the original crashed
        # the route on any network error).
        pass
    (tokens, actions_list) = parser(message)
    if len(actions_list) == 0:
        # Parser found no actionable phrase: fall back to the raw message.
        actions_list.append(message)
    event_created = Event(eid=eid, host=host, message=actions_list[0],
                          time=event_time, location=location,
                          lat=lat, lon=lon)
    try:
        db.session.add(event_created)
        db.session.commit()
    except Exception:
        # Narrowed from a bare ``except:``; roll back so the session
        # remains usable after the failed commit.
        db.session.rollback()
        err_msg = "Create Event Failed!"
        context = dict(data=err_msg)
        return render_template("index.html", **context)
    print("Create Event Succeeded!")
    return redirect(url_for('index'))
def getStrength(nlp, line):
    """Compute an aggregated sentiment strength score for *line*.

    Returns 0 for None/blank input; otherwise parses the text with
    *nlp*, extracts opinion phrases per sentence, and aggregates
    their strengths with statistics().
    """
    # Test for None BEFORE calling .strip(): in the original the check
    # came after line.strip(), so it was dead code -- str.strip() never
    # returns None, and a None argument raised AttributeError first.
    if line is None:
        return 0
    line = line.strip()
    if not line:
        # Blank input carries no sentiment.
        return 0
    line = line.encode("utf-8")
    loadSENTI(os.path.join(pwd, 'sentiment2.txt'))
    # Load the nonlinear sentiment phrase strengths.
    nonLINEAR = loadLEXICON(os.path.join(pwd, 'nonlinear.txt'))
    # Natural language processing: POS tagging + parsing per sentence.
    posed, parsed = parser(nlp, line)
    # Find opinion phrases and compute the sentiment strength.
    seqs = []
    for pos, parse in zip(posed, parsed):
        phrases = findPHRASE(pos, parse)
        finalPH = filterPHRASE(phrases)
        phraseNUMBERseqs = calALL(nonLINEAR, os.path.join(pwd, 'advxxx.txt'), finalPH)
        seqs.append(phraseNUMBERseqs)
    senti = statistics("|".join(seqs))
    return senti