def load_sentences(dataset_path):
    """Loads the dataset as a list of sentences.

    Params:
    - dataset_path (str): Path to dataset

    Returns:
    - sentences (pyspark.rdd.RDD): RDD containing sentences
    """
    records = wordcount.load_records(dataset_path)\
        .map(lambda record: Row(record_id=record[0], **record[1].asDict()))
    wordcount.rdd_show(records, "=====Records=====")
    # Split each record into sentences and filter out sentences shorter than
    # 20 characters. Then add a unique ID to each sentence and make that ID
    # the first element of the pair, so it can be used as a key.
    sentences = records.flatMap(record_to_sentences)\
        .filter(lambda sentence: len(sentence['Sentences_t']) >= 20)\
        .zipWithUniqueId()\
        .map(lambda record: (record[1], record[0]))
    wordcount.rdd_show(sentences, "=====Loaded Sentences=====")
    return sentences
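# record_to_sentences is referenced above but defined elsewhere in this module.
# A minimal sketch of its expected contract, assuming each record Row carries
# its raw text in the 'Sentences_t' field and that NLTK's sentence tokenizer is
# acceptable (both are assumptions, not confirmed by this file):
import nltk  # one-time setup: nltk.download('punkt')
from pyspark.sql import Row

def record_to_sentences(record):
    """Hypothetical sketch: yields one Row per sentence of a record's text."""
    fields = record.asDict()
    text = fields.pop("Sentences_t", "")  # field name is an assumption
    return [Row(Sentences_t=sentence, **fields)
            for sentence in nltk.sent_tokenize(text)]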
"""Converts the dataset RDD to a dataFrame and saves it as multiple JSON files. Params: - dataset (pyspark.sql.DataFrame): The dataset containing the id, record, preprocessed record, and both feature sets for each record """ dataset.show() dataset.write.json(filename, mode="overwrite") # End of save_dataset_as_dataframe() if __name__ == "__main__": args = parse_arguments() if args.sentences: sentences = load_sentences(args.file) with open("bag_of_words_labels.json", "r") as bow_file: bag_of_words_labels = json.load(bow_file) preprocessed_contents = preprocess_records_keep_fields(sentences) else: records = wordcount.load_records(args.file, False) preprocessed_contents = preprocess_records_keep_fields(records) if os.path.isfile("bag_of_words_labels.json"): print("Loading bag of words labels from file") with open("bag_of_words_labels.json", "r") as bow_file: bag_of_words_labels = json.load(bow_file) else: bag_of_words_labels = get_bag_of_words_labels(preprocessed_contents, args) feature_sets = preprocessed_contents.map(lambda contents: make_feature_sets(contents, bag_of_words_labels)) dataset = feature_sets.toDF() save_dataset_as_dataframe(dataset, args.output)
def pos_tag_verbs(records):
    '''POS-tags the records and extracts the most important verbs.

    Params:
    - records (list<str>): the contents of each record, stored as a string in a
      list (i.e. a list of strings)

    Return:
    - important_words (list): a list of important verbs
    '''
    tagged_records = pos_parser(records)
    verbs = verb_tagger(tagged_records)
    tfidf_scores = pos_tfidf_scores(verbs)
    important_words = tfidf.extract_important_words(tfidf_scores, len(verbs))
    return important_words

if __name__ == "__main__":
    args = wordcount.parse_arguments()
    records = wordcount.load_records(args.file)  # RDD of (id, record-dict) pairs
    records = records.collect()
    # Pull the text field out of each record's dictionary into a list of strings
    contents = list(map(lambda record: record[1][constants.TEXT], records))
    pos_tagged_records = pos_parser(contents)
    nv_tuple = pos_nv_tagger(pos_tagged_records)
    print("MOST IMPORTANT NOUNS:")
    print(pos_tfidf(nv_tuple[0]))
    print("MOST IMPORTANT VERBS:")
    print(pos_tfidf(nv_tuple[1]))
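# pos_parser and pos_nv_tagger are defined elsewhere in this module. A minimal
# sketch of the tagging step with NLTK and Penn Treebank tags (the helper
# bodies below are assumptions, not the module's actual implementation).
# One-time setup: nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
import nltk

def pos_parser(records):
    """POS-tags each record: returns one list of (word, tag) pairs per record."""
    return [nltk.pos_tag(nltk.word_tokenize(text)) for text in records]

def pos_nv_tagger(tagged_records):
    """Splits tagged records into (nouns, verbs) lists, one per record."""
    nouns = [[w for w, t in rec if t.startswith("NN")] for rec in tagged_records]
    verbs = [[w for w, t in rec if t.startswith("VB")] for rec in tagged_records]
    return nouns, verbs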