def main(argv):
    """Create n-grams from "New Years Resolution_merged.tsv" and write them
    out as JSON, plus a cleaned token list as TSV.

    argv: [file_name, input_directory_name, n] where n is the n-gram size.
    """
    if validate_argv(argv) is False:
        print("Usage: tokenizeTweets.py <file name> <file directory> <n (for n gram)>")
        sys.exit()

    file_name, input_directory_name = argv[0], argv[1]
    n_for_ngrams = int(argv[2])
    input_path = fp.get_file_path(file_name, input_directory_name)
    tweets_deduped = dedupe_and_tokenize(extract_tweets(input_path))

    # Build the n-gram dictionary and serialize it to JSON.
    ngram_dict = dg.create_ngram_dict(dg.create_ngrams(tweets_deduped, n_for_ngrams))
    ngram_path = fp.set_output_file_path(
        'New Years Resolution_ngram_' + str(n_for_ngrams) + '.json', 'ngrams')
    output_ngram(ngram_dict, ngram_path)

    # Build a flat token list, dropping the search-term words themselves.
    tokens = break_down_sentences(tweets_deduped)
    tokens_cleaned = remove_tokens(tokens, ['new', 'years', 'resolution', ':'])
    token_path = fp.set_output_file_path('New Years Resolution_tokens.tsv', 'tokens')
    output_tokens(tokens_cleaned, token_path)
def main(argv):
    """Merge files named "<search_term>_HHMM.tsv" from the data_raw directory
    into "<search_term>_merged.tsv" in the "merged" directory.

    The first (header) line of every input file is skipped, so the merged
    output contains data rows only.

    argv: [search_term]
    """
    if validate_argv(argv) is False:
        print("Usage: mergeFiles.py <search_term>")
        sys.exit()

    input_directory_name = 'data_raw'
    search_term = argv[0]
    output_file_name = search_term + '_merged.tsv'
    output_directory_name = 'merged'
    output_path = fp.set_output_file_path(output_file_name, output_directory_name)

    # Bug fix: the original opened files without `with`, leaking handles when
    # an exception interrupted the merge; it also shadowed the builtin `file`
    # and used the Py2-only `file.next()`.
    with open(output_path, 'a') as output:
        # Timestamp suffixes enumerate digit pairs: hours (0-2)(0-9),
        # minutes (0-5)(0-9) — same 3*10*6*10 combinations as the original.
        for h1 in range(3):
            for h2 in range(10):
                for m1 in range(6):
                    for m2 in range(10):
                        file_name = (search_term + '_' + str(h1) + str(h2)
                                     + str(m1) + str(m2) + '.tsv')
                        file_path = fp.get_file_path(file_name, input_directory_name)
                        if fp.filename_exists(file_path):
                            with open(file_path, 'r') as infile:
                                next(infile)  # skip the header row
                                for line in infile:
                                    output.write(line)
def validate_argv(argv):
    """List[String,...] -> Boolean

    Validate command-line arguments: there must be exactly two, and the
    1st (file name) plus 2nd (directory) must resolve to an existing file.
    Returns False when any check fails, True otherwise.
    """
    if len(argv) != 2:
        return False

    name, directory = argv[0], argv[1]
    target_path = fp.get_file_path(name, directory)
    if fp.filename_exists(target_path) is False:
        print("File doesn't exist")
        return False

    return True
def main(argv):
    """Generate a random sentence from a token list using a trigram Markov
    chain (implementation from https://gist.github.com/agiliq/131679).

    argv: [file_name, input_directory_name]
    """
    if validate_argv(argv) is False:
        print("Usage: tokenizeTweets.py <file name> <file directory>")
        sys.exit()

    file_name = argv[0]
    input_directory_name = argv[1]
    path = fp.get_file_path(file_name, input_directory_name)

    # Bug fix: the original bound the handle to `input` (shadowing the
    # builtin) and would leak it if Markov() raised; `with` closes it
    # on every path.
    with open(path, 'r') as token_file:
        text_markov = markov.Markov(token_file)
    print(text_markov.generate_markov_text())
def validate_argv(argv):
    """List[String,...] -> Boolean

    Validate the command-line arguments:
      a) there are exactly 3 arguments,
      b) the 1st & 2nd arguments name an existing file in a directory,
      c) the 3rd argument is an integer between 2 and 5 (inclusive).
    Returns False if any check fails, True otherwise.
    """
    if len(argv) != 3:
        return False

    file_name = argv[0]
    input_directory_name = argv[1]
    n = argv[2]
    file_path = fp.get_file_path(file_name, input_directory_name)
    if fp.filename_exists(file_path) is False:
        print("File doesn't exist")
        return False

    try:
        n_int = int(n)
    except ValueError:
        # Bug fix: the original printed this message *after* `return False`,
        # so it was unreachable; it also used a bare `except:`, which would
        # swallow unrelated errors (KeyboardInterrupt, etc.).
        print("n must be an integer")
        return False

    if n_int < 2 or n_int > 5:
        # Bug fix: the original message ("greater than 1 or less than 6")
        # used the wrong connective and described the valid range.
        print("n must be between 2 and 5")
        return False

    return True
def main(argv):
    """Randomly generate a tweet from an n-gram dictionary, seeded with a
    single start term.

    Example: createTweets.py New\\ Years\\ Resolution_ngram_3.json ngrams stop

    argv: [file_name, input_directory_name, start_term]
    """
    if validate_argv(argv) is False:
        print("Usage: tokenizeTweets.py <file name> <file directory> <start term>")
        sys.exit()

    file_name, input_directory_name = argv[0], argv[1]
    search_term = argv[2].lower()
    input_path = fp.get_file_path(file_name, input_directory_name)
    ngram_dictionary = extract_dictionary(input_path)

    if validate_search_term(search_term, ngram_dictionary) is False:
        print("'" + search_term + "' does not exist in tweet dictionary")
        sys.exit()

    # Up to 5 retries to get a sentence of at least 6 tokens (including
    # the '$' end marker); otherwise keep whichever attempt came last.
    sentence_list = create_sentence(search_term, ngram_dictionary)
    attempts = 0
    while len(sentence_list) < 6 and attempts < 5:
        sentence_list = create_sentence(search_term, ngram_dictionary)
        attempts += 1

    if '$' in sentence_list:
        sentence_list.remove('$')
    print(' '.join(sentence_list))