def __init__(self, bigram_file):
    """Build the lexicon and the bigram lookup table.

    Reads the first NUMBER_OF_WORDS entries of "bigwordlist.txt" (one
    word per line, word is the first whitespace-separated token), runs
    them through LexiconImprover, then indexes every line of
    *bigram_file* by its first token into self.bigram_dict.

    bigram_file -- an iterable of lines, each "first\tsecond ..." (tabs
                   are normalized to single spaces before splitting).
    """
    NUMBER_OF_WORDS = 75000
    lexicon = []
    # with-statement fixes the original's leaked file handle.
    with open("bigwordlist.txt", "r") as big_words_file:
        for _ in range(NUMBER_OF_WORDS):
            tokens = big_words_file.readline().split()
            if not tokens:
                # Blank line or EOF: the original crashed here with an
                # IndexError on [0]; stop reading instead.
                break
            # split() tokens contain no whitespace, so no further
            # cleanup is needed (the original's replace() was a no-op).
            lexicon.append(tokens[0])

    # NOTE(review): attribute name suggests it holds the improver, but it
    # actually stores the improved lexicon — kept for caller compatibility.
    li = LexiconImprover(lexicon)
    self.lexImprover = li.improveLexicon()

    self.pos_bigram_combination = []
    self.bigram_dict = {}   # first token -> list of token-lists
    self.best_bigram = []
    self.max_prob = 0

    # Index each bigram line by its first token. Parsing mirrors the
    # original exactly: strip newline, tabs -> spaces, split on single
    # spaces (NOT split() — empty fields are preserved).
    for raw in bigram_file:
        fields = raw.replace("\n", "").replace("\t", " ").split(" ")
        # setdefault replaces the Python-2-only has_key() branch pair.
        self.bigram_dict.setdefault(fields[0], []).append(fields)
# Load the first NUMBER_OF_WORDS entries from the big word list
# (bigWordsFile is opened earlier in this file): one word per line,
# the word being the first whitespace-separated token.
NUMBER_OF_WORDS = 75000
lexicon = []
for _ in range(NUMBER_OF_WORDS):
    tokens = bigWordsFile.readline().split()
    if not tokens:
        # Blank line or EOF: the original raised IndexError on [0] here;
        # stop reading instead of crashing.
        break
    # split() tokens contain no whitespace, so the original's discarded
    # replace(" ", "") call was a no-op and is dropped.
    lexicon.append(tokens[0])

# improve the lexicon by removing infrequent words that are relatively
# few letters in length
lexImprover = LexiconImprover(lexicon)
lexicon = lexImprover.improveLexicon()

# hashTagFile is opened elsewhere in this file.
allHashTags = hashTagFile.readlines()

# Hand-labelled segmentations used as a small evaluation sample.
# NOTE(review): 'entrepeneurship' looks like a typo for
# 'entrepreneurship' — kept byte-for-byte; confirm with the data's owner.
sampleHashTagsWithAnswers = {
    '#30secondstomars': ['30', 'seconds', 'to', 'mars'],
    '#americanidol': ['american', 'idol'],
    '#hurricaneike': ['hurricane', 'ike'],
    '#celebraterandommilestones': ['celebrate', 'random', 'milestones'],
    '#entrepreneurship': ['entrepeneurship'],
    '#firstdayofkindergarten': ['first', 'day', 'of', 'kindergarten'],
}
sampleHashTags = []