예제 #1
0
 def enumerate(row: ps.Row, enumerator: FragmentReactionSliceEnumerator, max_cuts: int) -> List[ps.Row]:
     """Slice the molecule of one input row into scaffold/decoration rows.

     The first tab-separated field of ``row`` is read as a SMILES string and
     converted with ``uc.to_mol``. Every sliced molecule produced by
     ``enumerator.enumerate(mol, cuts=max_cuts)`` becomes one ``ps.Row``
     holding the scaffold (with attachment point numbers removed), the
     decorations, the original SMILES and the ``max_cuts`` value used.

     :param row: Tab-separated record whose first field is a SMILES string.
     :param enumerator: Object whose ``enumerate`` method yields the slices.
     :param max_cuts: Value forwarded as ``cuts`` and stored in each row.
     :return: One row per slice; an empty list when ``uc.to_mol`` returns a
         falsy value for the SMILES.
     """
     attachment_points = AttachmentPoints()
     smiles = row.split("\t")[0]
     molecule = uc.to_mol(smiles)
     if not molecule:
         # Nothing to slice when the SMILES could not be converted.
         return []
     return [
         ps.Row(**{
             DataframeColumnsEnum.SCAFFOLDS:
                 attachment_points.remove_attachment_point_numbers(sliced.scaffold_smiles),
             DataframeColumnsEnum.DECORATIONS: sliced.decorations_smiles,
             DataframeColumnsEnum.ORIGINAL: sliced.original_smiles,
             DataframeColumnsEnum.MAX_CUTS: max_cuts,
         })
         for sliced in enumerator.enumerate(molecule, cuts=max_cuts)
     ]
예제 #2
0
 def collect_failures(
         self, row: ps.Row,
         enumerator: FailingReactionsEnumerator) -> List[ps.Row]:
     """Collect rows describing reactions that failed for one input molecule.

     The first tab-separated field of ``row`` is read as a SMILES string and
     converted with ``uc.to_mol``. Each item yielded by
     ``enumerator.enumerate`` is turned into a ``ps.Row`` with the reaction
     SMIRKS and the molecule SMILES. Collection stops as soon as the number
     of gathered rows reaches ``self.configuration.failures_limit``; the
     check runs after each append.

     :param row: Tab-separated record whose first field is a SMILES string.
     :param enumerator: Object whose ``enumerate`` method yields the failures.
     :return: The collected rows; an empty list when ``uc.to_mol`` returns a
         falsy value for the SMILES.
     """
     smiles = row.split("\t")[0]
     molecule = uc.to_mol(smiles)
     collected = []
     if not molecule:
         return collected

     failures = enumerator.enumerate(
         molecule, failures_limit=self.configuration.failures_limit)
     for failure in failures:
         row_fields = {
             self._columns.REACTION: failure.reaction_smirks,
             self._columns.ORIGINAL: failure.molecule_smiles,
         }
         print("found failed reaction")
         collected.append(ps.Row(**row_fields))
         # Stop early once enough failures have been gathered.
         if len(collected) >= self.configuration.failures_limit:
             break
     return collected
    return previous_row[-1]


# Command-line arguments: the directory holding movies.dat / ratings.dat /
# users.dat, the path of the new user's ratings file, and a third value kept
# as `output`.
movie_data = sys.argv[1]
user_data = sys.argv[2]
output = sys.argv[3]

# Fixed user id; used further down as the user id of the matched rows.
userId = 112132212
movie = Row("id", "movieName")

movie_table = sc.textFile(movie_data + str("/movies.dat"))
rating_table = sc.textFile(movie_data + str("/ratings.dat"))
user_data_table = sc.textFile(movie_data + str("/users.dat"))
new_user = sc.textFile(user_data)

# Each movies.dat line is split on "::"; fields 0 and 1 are used below as the
# movie id and the movie title.
movieRDD = movie_table.map(lambda line: line.split("::"))

# The first three "::"-separated fields of each ratings.dat line become
# Rating(user id, movie id, rating).
# NOTE: lambdas take one argument and index into it instead of using
# tuple-parameter unpacking, which Python 3 rejects (PEP 3113); the indexed
# form behaves the same on Python 2.
ratingDF = (
    rating_table.map(lambda line: line.split("::"))
    .map(lambda parts: Rating(int(parts[0]), int(parts[1]), float(parts[2])))
    .toDF())

# Each line of the new user's file is split at the first space into two
# parts; below, part 0 is treated as the rating and part 1 as the title.
newUserRDD = new_user.map(lambda line: line.split(" ", 1))

# Pair every catalogue movie with every user entry, score each pair by the
# levenshtein() of the two titles, and keep per user title the candidate
# with the smallest score (the last element of the value tuple).
joinRDD = (
    movieRDD.cartesian(newUserRDD)
    # (movie fields, user fields) -> (movie id, movie title, rating, user title)
    .map(lambda pair: (pair[0][0], pair[0][1], pair[1][0], pair[1][1]))
    # -> (user title, (movie id, rating, title distance))
    .map(lambda rec: (rec[3], (rec[0], rec[2], levenshtein(rec[1], rec[3]))))
    .reduceByKey(lambda x1, x2: min(x1, x2, key=lambda x: x[-1])))

userMovie = (
    joinRDD.map(lambda (key, value): (userId, value[0], value[1])).map(