""" h = HTMLParser.HTMLParser() # start_time = time.time() f1 = open(sys.argv[1], "r") l1 = f1.readlines() tags = [] for line in l1: tags.append(line[:-1]) obj = AssignTags(tags) input_dir = sys.argv[2] # input directory output_dir = sys.argv[3] # output directory for f in listdir(input_dir): fname = join(input_dir, f) f2 = codecs.open(fname, "r", encoding="utf-8") question = f2.readline() question = question.encode('ascii', 'ignore') question = h.unescape(question) description = f2.read() description = description.encode('ascii', 'ignore') description = h.unescape(description) CONTENT = question + "\n" + question + "\n" + description np1 = extractNP(CONTENT) keywords = extractKeywords(np1) tags = obj.getTags(keywords) fname = join(output_dir, f) f3 = open(fname, "w") for tag in tags: f3.write(tag + "\n") f2.close() f3.close()
# Collocation exploration script (Python 2):
#   argv[1] - question text file, argv[2] - description text file.
# Extracts noun phrases, RAKE-style keywords, then prints the top bigrams of
# the keyword words ranked by raw frequency.
import sys
from simnpchunk_rake import extractNP, extractKeywords
import nltk  # FIX: original used nltk.collocations.* without importing nltk
from nltk.collocations import *

qf = open(sys.argv[1])
df = open(sys.argv[2])
question = qf.read()
question = question.translate(None, "!()")        # py2-only str.translate API
description = df.read()
description = description.translate(None, "!()")

Q_NP = extractNP(question)
D_NP = extractNP(description)
# Question NPs are doubled to weight the title over the body.
F_NP = Q_NP + Q_NP + D_NP
keywords = extractKeywords(F_NP)
#obj = RakeKeywordExtractor()
#keywords = obj.extract(question,description,True)

# Flatten multi-word keywords into a flat word list; each keyword is a
# (phrase, score) tuple, so key[0] is the phrase text.
wordlist = []
for key in keywords:
    temp = key[0].split(' ')
    for word in temp:
        wordlist.append(word)

print("TOP BIGRAMS")
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_words(wordlist)  # finder object
# Score every bigram by raw frequency.
scores = bigram_finder.score_ngrams(bigram_measures.raw_freq)
# FIX: the original `for i in range(0,50):` had an empty body (syntax error)
# and an unguarded range that raises IndexError with fewer than 50 bigrams.
for i in range(0, min(50, len(scores))):
    print(scores[i])
# Batch tag-assignment script (Python 2) -- duplicate of the variant earlier
# in this file, reformatted from a single mangled line.
#   argv[1] - file listing one known tag per line
#   argv[2] - input directory (each file: line 1 = question title, rest = body)
#   argv[3] - output directory; one predicted tag per line per input file
import sys
import codecs
import HTMLParser  # Python 2 stdlib module (html.parser on Python 3)
from os import listdir
from os.path import join

# Project-local helpers -- TODO(review): confirm module paths; the original
# one-liner never imported these names.
from simnpchunk_rake import extractNP, extractKeywords
# from assign_tags import AssignTags  # TODO(review): AssignTags was undefined
#                                     # in the original -- locate its module.

# FIX: `h` was used via h.unescape() but never defined in this unit, which
# would raise NameError at runtime.
h = HTMLParser.HTMLParser()
# start_time = time.time()

# Load the allowed tag vocabulary, one tag per line (drop trailing newline).
with open(sys.argv[1], "r") as f1:
    tags = [line[:-1] for line in f1.readlines()]
obj = AssignTags(tags)

input_dir = sys.argv[2]   # input directory
output_dir = sys.argv[3]  # output directory

for f in listdir(input_dir):
    # First line of each file is the title; the rest is the description.
    with codecs.open(join(input_dir, f), "r", encoding="utf-8") as f2:
        question = f2.readline()
        description = f2.read()
    question = h.unescape(question.encode('ascii', 'ignore'))
    description = h.unescape(description.encode('ascii', 'ignore'))
    # The title is repeated to weight it more heavily than the body.
    CONTENT = question + "\n" + question + "\n" + description
    np1 = extractNP(CONTENT)
    keywords = extractKeywords(np1)
    tags = obj.getTags(keywords)
    # Write one predicted tag per line to the mirror file in output_dir.
    with open(join(output_dir, f), "w") as f3:
        for tag in tags:
            f3.write(tag + "\n")
# Collocation script (Python 2) -- second copy of the bigram variant in this
# file, reformatted from a single mangled line.
#   argv[1] - question text file, argv[2] - description text file.
# Builds a raw-frequency bigram ranking over the extracted keyword words.
import nltk
import sys  # FIX: `import sys` appeared twice in the original
from simnpchunk_rake import extractNP, extractKeywords
from nltk.collocations import *

qf = open(sys.argv[1])
df = open(sys.argv[2])
question = qf.read()
question = question.translate(None, "!()")        # py2-only str.translate API
description = df.read()
description = description.translate(None, "!()")

Q_NP = extractNP(question)
D_NP = extractNP(description)
# Question NPs are doubled to weight the title over the body.
F_NP = Q_NP + Q_NP + D_NP
keywords = extractKeywords(F_NP)
#obj = RakeKeywordExtractor()
#keywords = obj.extract(question,description,True)

# Flatten multi-word keywords into a flat word list; each keyword is a
# (phrase, score) tuple, so key[0] is the phrase text.
wordlist = []
for key in keywords:
    temp = key[0].split(' ')
    for word in temp:
        wordlist.append(word)

print("TOP BIGRAMS")
bigram_measures = nltk.collocations.BigramAssocMeasures()
# NOTE(review): trigram_measures was created but never used; removed.
bigram_finder = BigramCollocationFinder.from_words(wordlist)  # finder object
# Score every bigram by raw frequency. NOTE(review): `scores` is unused from
# here on in this unit -- presumably printed/consumed in a truncated tail.
scores = bigram_finder.score_ngrams(bigram_measures.raw_freq)