"""
    h = HTMLParser.HTMLParser()
    #	start_time = time.time()
    f1 = open(sys.argv[1], "r")
    l1 = f1.readlines()
    tags = []
    for line in l1:
        tags.append(line[:-1])
    obj = AssignTags(tags)
    input_dir = sys.argv[2]  # input directory
    output_dir = sys.argv[3]  # output directory
    for f in listdir(input_dir):
        fname = join(input_dir, f)
        f2 = codecs.open(fname, "r", encoding="utf-8")
        question = f2.readline()
        question = question.encode('ascii', 'ignore')
        question = h.unescape(question)
        description = f2.read()
        description = description.encode('ascii', 'ignore')
        description = h.unescape(description)
        CONTENT = question + "\n" + question + "\n" + description
        np1 = extractNP(CONTENT)
        keywords = extractKeywords(np1)
        tags = obj.getTags(keywords)
        fname = join(output_dir, f)
        f3 = open(fname, "w")
        for tag in tags:
            f3.write(tag + "\n")
        f2.close()
        f3.close()
Пример #2
0
import sys
from simnpchunk_rake import extractNP, extractKeywords
from nltk.collocations import *



# Local import: the code below refers to ``nltk.collocations`` by its
# qualified name, which requires the top-level ``nltk`` name to be bound.
# The imports preceding this script only do ``from nltk.collocations import *``.
import nltk

# Read the question text and the description text from the two paths given
# on the command line.  ``with`` closes each handle as soon as it is read.
with open(sys.argv[1]) as qf:
    question = qf.read()
with open(sys.argv[2]) as df:
    description = df.read()

# Delete the characters '!', '(' and ')' before extraction.  The
# two-argument ``translate(None, deletechars)`` form is the Python 2
# ``str`` API.
question = question.translate(None, "!()")
description = description.translate(None, "!()")

# Run the extractor on each text separately.  The question's result is
# concatenated twice, so its entries appear twice in the combined input
# (presumably to weight the question above the description -- confirm).
Q_NP = extractNP(question)
D_NP = extractNP(description)
F_NP = Q_NP + Q_NP + D_NP
keywords = extractKeywords(F_NP)

# Disabled alternative kept from the original script:
# obj = RakeKeywordExtractor()
# keywords = obj.extract(question, description, True)

# Flatten the keywords into one word sequence: the first element of each
# keyword entry is split on single spaces and its words appended in order.
wordlist = []
for key in keywords:
    wordlist.extend(key[0].split(' '))

# Parenthesised form prints the same text under Python 2 and Python 3.
print("TOP BIGRAMS")
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Build the collocation finder over the flattened word sequence.
bigram_finder = BigramCollocationFinder.from_words(wordlist)
# Score every bigram with the raw-frequency association measure.
scores = bigram_finder.score_ngrams(bigram_measures.raw_freq)
for i in range(0,50):
Пример #3
0
#	start_time = time.time()
	f1 = open(sys.argv[1],"r")
	l1 = f1.readlines()
	tags=[]
	for line in l1:
		tags.append(line[:-1])
	obj = AssignTags(tags)
	input_dir = sys.argv[2]	# input directory
	output_dir= sys.argv[3]	# output directory
	for f in listdir(input_dir):
		fname = join(input_dir,f)
		f2 = codecs.open(fname,"r",encoding="utf-8")
		question = f2.readline()
		question = question.encode('ascii','ignore')
		question = h.unescape(question)
		description = f2.read()
		description = description.encode('ascii','ignore')
		description = h.unescape(description)
		CONTENT  = question + "\n" + question+ "\n" + description
		np1 = extractNP(CONTENT)
		keywords = extractKeywords(np1)
		tags=obj.getTags(keywords)
		fname = join(output_dir,f)
		f3 = open(fname,"w")
		for tag in tags:
			f3.write(tag+"\n")
		f2.close()
		f3.close() 

		
import nltk
import sys
import sys
from simnpchunk_rake import extractNP, extractKeywords
from nltk.collocations import *

# Read the question text and the description text from the two paths given
# on the command line.  ``with`` closes each handle as soon as it is read,
# instead of leaving both files open for the rest of the script.
with open(sys.argv[1]) as qf:
    question = qf.read()
with open(sys.argv[2]) as df:
    description = df.read()

# Delete the characters '!', '(' and ')' before extraction.  The
# two-argument ``translate(None, deletechars)`` form is the Python 2
# ``str`` API.
question = question.translate(None, "!()")
description = description.translate(None, "!()")

# Run the extractor on each text separately.  The question's result is
# concatenated twice, so its entries appear twice in the combined input
# (presumably to weight the question above the description -- confirm).
Q_NP = extractNP(question)
D_NP = extractNP(description)
F_NP = Q_NP + Q_NP + D_NP
keywords = extractKeywords(F_NP)

# Disabled alternative kept from the original script:
# obj = RakeKeywordExtractor()
# keywords = obj.extract(question, description, True)

# Flatten the keywords into one word sequence: the first element of each
# keyword entry is split on single spaces and its words appended in order.
wordlist = []
for key in keywords:
    wordlist.extend(key[0].split(' '))

# Parenthesised form prints the same text under Python 2 and Python 3.
print("TOP BIGRAMS")
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Build the collocation finder over the flattened word sequence.
bigram_finder = BigramCollocationFinder.from_words(wordlist)
# Score every bigram with the raw-frequency association measure.
scores = bigram_finder.score_ngrams(bigram_measures.raw_freq)