def __init__(self, dictionary_path=None):
    if dictionary_path is None:
        self.dictionary = None
    else:
        self.dictionary = t4k.UnigramDictionary()
        self.dictionary.load(dictionary_path)
def generate_top_random_task(out_path, num_top, num_rand):
    # Open the file that we'll write to
    out_file = open(out_path, 'w')

    # First, figure out what words have already been annotated
    results = t4k.CrowdflowerResults(RESULTS_PATHS)
    already_annotated = {row['data']['token'] for row in results}

    # Now get a sorted dictionary for gigaword
    dictionary = t4k.UnigramDictionary()
    dictionary.load(DICTIONARY_PATH)

    # Get the top ``num_top`` words that haven't yet been annotated
    top_words = set()
    for token in dictionary.get_token_list():

        # Skip the UNK token
        if token == 'UNK':
            continue

        # Add words that haven't been annotated before
        if token not in already_annotated:
            top_words.add(token)

        # Stop once we have enough words
        if len(top_words) >= num_top:
            break

    # Now, get ``num_rand`` uniformly randomly selected words that have not
    # been annotated.  Candidates include any non-'UNK' word that hasn't
    # been annotated before.
    candidates = set(list(dictionary.get_token_list())[1:]) - already_annotated
    rand_words = set(random.sample(list(candidates), num_rand))

    # We're almost ready to start writing out to file.  Make a list of all
    # the rows so that we can randomly shuffle them before writing to file.
    rows = []
    for word in rand_words | top_words:
        if word in rand_words and word in top_words:
            rows.append((word, 'top:rand'))
        elif word in rand_words:
            rows.append((word, 'rand'))
        else:
            rows.append((word, 'top'))
    random.shuffle(rows)

    # Write out the headings, then write the rows
    writer = csv.writer(out_file)
    writer.writerow(('token', 'source'))
    writer.writerows(rows)
    out_file.close()
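# Usage sketch (an assumption, not part of the original code): RESULTS_PATHS
# and DICTIONARY_PATH are module-level constants defined elsewhere in this
# repo; the output path and counts below are hypothetical.
#
#     generate_top_random_task('top_random_task.csv', num_top=1000, num_rand=500)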
def assert_feature_like_on_disc(self, feature_accumulator, path):
    # Test that the extracted dictionary is the same as the one on disk
    expected_dictionary = t4k.UnigramDictionary()
    expected_dictionary.load(os.path.join(path, 'dictionary'))
    self.assertEqual(
        dict(feature_accumulator.dictionary.get_frequency_list()),
        dict(expected_dictionary.get_frequency_list())
    )

    # Test that the extracted features are the same as the ones on disk
    for feature_type in ['dependency', 'baseline', 'hand_picked']:
        with open(os.path.join(path, feature_type + '.json')) as feature_file:
            expected = json.load(feature_file)
        self.assertDictEqual(
            getattr(feature_accumulator, feature_type), expected
        )
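# Hedged sketch of how this helper might be invoked from a test method on the
# same TestCase; ``make_accumulator`` and ``TEST_DATA_PATH`` are hypothetical
# stand-ins for the real fixture setup.
#
#     def test_features_match_disc(self):
#         feature_accumulator = make_accumulator(TEST_DATA_PATH)
#         self.assert_feature_like_on_disc(feature_accumulator, TEST_DATA_PATH)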
def get_top_words():
    """
    Get the k most common words in Gigaword such that all k of them have
    been annotated, with k as large as possible.
    """
    all_annotated_words = get_all_annotated_words()
    dictionary = t4k.UnigramDictionary()
    dictionary.load(DICTIONARY_DIR)
    top_words = []
    for token in dictionary.get_token_list():
        if token == 'UNK':
            continue
        if token in all_annotated_words:
            top_words.append(token)
        else:
            break
    return top_words
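# Toy illustration, not part of the original code: because get_token_list()
# returns tokens in descending frequency order, the loop above can stop at
# the first unannotated token, yielding the longest fully-annotated prefix.
# All data here is hypothetical.
def _demo_top_words_prefix():
    token_list = ['UNK', 'the', 'of', 'and', 'cat', 'dog']  # frequency order
    annotated = {'the', 'of', 'and', 'dog'}  # 'cat' was never annotated

    top_words = []
    for token in token_list:
        if token == 'UNK':
            continue
        if token in annotated:
            top_words.append(token)
        else:
            break

    # 'dog' is annotated but excluded: 'cat' breaks the prefix first.
    assert top_words == ['the', 'of', 'and']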
def generate_random_candidates(num_to_generate, out_path, exclude=frozenset()):
    # Open the path that we want to write to
    out_f = open(out_path, 'w')

    # Open the dictionary of words seen in the corpus
    dictionary_path = os.path.join(
        BEST_WORDNET_ONLY_FEATURES_PATH, 'dictionary')
    dictionary = t4k.UnigramDictionary()
    dictionary.load(dictionary_path)

    # Uniformly randomly sample from it, rejecting 'UNK', excluded tokens,
    # and tokens we've already drawn
    samples = set()
    while len(samples) < num_to_generate:
        token = random.choice(dictionary.token_map.tokens)
        if token != 'UNK' and token not in exclude and token not in samples:
            samples.add(token)

    out_f.write('\n'.join(samples))
    out_f.close()
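# Alternative sketch (an assumption, not the original method): the while loop
# above is rejection sampling for distinct eligible tokens. Filtering first
# and using random.sample draws the same kind of sample in one shot.
# ``vocab`` is a hypothetical stand-in for dictionary.token_map.tokens.
import random

def sample_candidates(vocab, num_to_generate, exclude=frozenset()):
    # Keep only tokens that are eligible to be drawn, then sample distinct
    # tokens uniformly without replacement.
    eligible = [t for t in vocab if t != 'UNK' and t not in exclude]
    return random.sample(eligible, num_to_generate)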
def get_dictionary(path):
    dictionary = t4k.UnigramDictionary()
    dictionary.load(path)
    return dictionary