def nbc(query_file, result_file, dic, predict_fun):
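    """Classify every document in query_file with predict_fun and write
    the relabeled lines to result_file."""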
    print("-"*60)
    print("Start Classification...")
    print("Classification using", predict_fun.__name__)
    if predict_fun is bernoulli:
        bernoulli.tmp_result = {"0":0, "1":0}
        for t in dic.once["term"]:
            tmp = bernoulli.tmp_result
            tmp["0"] += math.log(1-smoothing_bernoulli(dic, t, "0"))
            tmp["1"] += math.log(1-smoothing_bernoulli(dic, t, "1"))

    in_f = open(query_file, "rt")
    out_f = open(result_file, "wt")

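    # Pass the query file's header line straight through to the output.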
    out_f.write(in_f.readline())

    for i, line in enumerate(in_f):
        #if i%1000 == 0:
        #    print(i)

        id_, sentence, label = training.split_doc(line.strip())
        term_list = nlp.parser(sentence)
        new_label = classify(term_list, dic, predict_fun)

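        # Overwrite a trailing 0/1 gold label with the prediction,
        # or append the prediction as a new column.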
        if line.strip()[-1] in ("0", "1"):
            new_line = line.strip()[:-1] + new_label + "\n"
        else:
            new_line = line.strip() + "\t" + new_label + "\n"

        out_f.write(new_line)
    in_f.close()
    out_f.close()
    print("Finish Classification!")
Example #2
File: batch.py Project: rhmiller47/sentiPY
def getStrength(nlp, line):
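    """Return the sentiment strength of one line of text (0 for empty input)."""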
    line = line.strip()
    if not line:
        return 0
    line = line.encode("utf-8")
    loadSENTI(os.path.join(pwd, 'sentiment2.txt'))
    ## load the nonlinear sentiment phrase strengths
    nonLINEAR = loadLEXICON(os.path.join(pwd, 'nonlinear.txt'))
    ## natural language processing

    posed, parsed = parser(nlp, line)
    ## find opinion phrases and compute the sentiment strength
    seqs = []
    for pos, parse in zip(posed, parsed):
        phrases = findPHRASE(pos, parse)
        finalPH = filterPHRASE(phrases)

        phraseNUMBERseqs = calALL(nonLINEAR, os.path.join(pwd, 'advxxx.txt'),
                                  finalPH)
        seqs.append(phraseNUMBERseqs)

    senti = statistics("|".join(seqs))
    return senti
Example #3
def get_keywords(querylist, delims, input_path, output_path, crawled_date):
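    """Extract the top keywords from crawled document titles and write them to a JSON file."""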
    # TODO: fix parsing so that it does not parse Korean (Hangul) text
    # TODO: add unicode support
    # TODO: check collocations

    queries = '_'.join(querylist)

    inp = input_filename(queries, crawled_date)
    filenames = get_filenames(input_path, inp)
    pprint(filenames)

    # TODO: why search _results[0]?
    with open(filenames[0], 'r') as f:
        docs = json.load(f)

    # TODO: printing d['title'] kept yielding the same entry; looks like a bug
    text = ''.join(d['title'] for d in docs)

    parsed = parser(text, delims)
    sanitized = [sanitizer(p) for p in parsed]
    # stems = stemmer(removed)
    removed = rm_stopwords(sanitized)
    filtered = [w for w in removed if w]

    keywords = word_counter(filtered, config.NKEYWORDS)

    '''
    d = dict()
    for k in keywords:
        d[k[0]] = k[1]

    sorted_d = sorted(d.iteritems(), key=operator.itemgetter(1), reverse=True)
    pprint(sorted_d[0:config.NPRINTWORDS])
    '''
    pprint(keywords[0:config.NPRINTWORDS])

    outp = output_filename(queries)

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    with open(os.path.join(output_path, outp), 'w') as f:
        # json.dump(d, f, indent=2)
        for k in keywords:
            json.dump(k, f)
            f.write(',\n')

    # TODO: keyword 결과에 tag까지 붙도록
    '''
    tags = dict(nltk.pos_tag(filtered))
    keyword_tags = [tags[k[0]] for k in keywords]
    kt = zip(keywords, keyword_tags)
    pprint(kt)
    return kt
    '''
    return keywords
Example #4
File: senti.py Project: goog/sentiPY
def sentiFLY1(line):
    """Compute the sentiment polarity of a line (verbose, with debug printing)."""
    print('current directory is', os.getcwd())
    loadSENTI('./sentiment2.txt')
    ## nlp
    print("segment begins:")
    seged = seg(line)
    print("pos tagger begins:")
    posed = pos(seged)
    print("parser begins:")
    parsed = parser(seged)
    print("parser is over.")

    print("seged", seged)
    print("parsed", parsed)

    ## find phrases and compute the sentiment polarity
    phrases = findPHRASE(posed, parsed)
    print(' '.join(phrases))
    finalPH = filterPHRASE(phrases)
    nonLINEAR = loadLEXICON('./nonlinear.txt')
    phraseNUMBERseqs = calALL(nonLINEAR, './advxxx.txt', finalPH)
    return statistics(phraseNUMBERseqs)
Example #5
def train(file_name):
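    """Build per-label term statistics from a labeled training file and print a summary."""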
    print("-"*60)
    print("Start training...")

    start_time = timeit.default_timer()
    dic = dictionary.Dictionary()

    with open(file_name) as f:
        next(f)  # skip the header line

        for i, line in enumerate(f):
            #if i%1000 == 0:
            #    print(i)

            id_, sentence, label = split_doc(line.strip())                
            
            dic.increment_label(label)
            term_list = nlp.parser(sentence)
            
            already = set()
            for t in term_list:
                dic.add_term_repeat(t, label)
                if t not in already:
                    dic.add_term_once(t, label)
                    already.add(t)
                    
    end_time = timeit.default_timer()
    
    print("End training!")
    print("Training of '{}'".format(file_name))
    print("Training time: {:.3} sec".format(end_time-start_time))
    print("Total {} docs".format(dic.total_doc()))
    print("Good doc:", dic.doc_frequency("1"))
    print("Bad doc:", dic.doc_frequency("0"))
    print("Number of terms:", len(dic.once["term"]))  
    return dic
Example #6
def createEvent():
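    """Create an event from the submitted form data and store it in the database."""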
    eid = "%.3f" % (time.time())
    host = request.form['host']
    message = request.form['message']
    event_time = request.form['time']
    location = request.form['location']
    
    (tokens, actions_list) = parser(message)
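    # Fall back to the raw message when the parser finds no actions.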
    if len(actions_list) == 0:
        actions_list.append(message)

    event_created = Event(eid=eid, host=host, message=actions_list[0],
                          time=event_time, location=location)

    try:
        db.session.add(event_created)
        db.session.commit()
    except Exception:
        err_msg = "Create Event Failed!"
        context = dict(data=err_msg)
        return render_template("index.html", **context)
    print("Create Event Succeeded!")
    return redirect(url_for('index'))
Example #7
def createEvent():
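    """Create an event from the submitted form, geocoding its location with the Google Maps Geocoding API."""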
    eid = "%.3f" % (time.time())
    host = request.form['host']
    message = request.form['message']
    event_time = request.form['time']
    location = request.form['location']
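    # NOTE: an API key hardcoded like this should normally be loaded from configuration.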
    key = "&key=AIzaSyAOd0yLa2PRCmldYBjXYLF5eZXhzaCv8jE"
    res = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json?address=' +
        location.replace(" ", "+") + key)
    res = res.json()
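    # Default coordinates, used when geocoding returns no results.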
    lat = "40.8075355"
    lon = "-73.9625727"
    if len(res["results"]) != 0:
        lat = res["results"][0]["geometry"]["location"]["lat"]
        lon = res["results"][0]["geometry"]["location"]["lng"]
    (tokens, actions_list) = parser(message)
    if len(actions_list) == 0:
        actions_list.append(message)

    event_created = Event(eid=eid,
                          host=host,
                          message=actions_list[0],
                          time=event_time,
                          location=location,
                          lat=lat,
                          lon=lon)

    try:
        db.session.add(event_created)
        db.session.commit()
    except Exception:
        err_msg = "Create Event Failed!"
        context = dict(data=err_msg)
        return render_template("index.html", **context)
    print("Create Event Succeeded!")
    return redirect(url_for('index'))