Example #1
 def test_fpgrowth(self):
     data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
     rdd = self.sc.parallelize(data, 2)
     model1 = FPGrowth.train(rdd, 0.6, 2)
     # use default data partition number when numPartitions is not specified
     model2 = FPGrowth.train(rdd, 0.6)
     self.assertEqual(sorted(model1.freqItemsets().collect()),
                      sorted(model2.freqItemsets().collect()))
Example #3
    def prepare_fpgrowth_data(self):
        tran_df = self.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'").select('ACCTNBR',
                                                                                                 'MER_CAT_CD') \
            .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

        result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

        def m(x):
            k = x[0]
            l = list(x[1])

            v = set()
            for i in l:
                v.add(i[0])

            return set(v)

        result = result.map(m)
        for i in result.take(10):
            print(i)

        model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
        result = model.freqItemsets().collect()
        for r in result:
            print(r)
Example #4
def FPGrowthRDD(transactionsRDD, minSupport=0.2, numPartitions=10):
    '''
    perform the FPGrowth algorithm
    '''
    model = FPGrowth.train(transactionsRDD, minSupport=minSupport, numPartitions=numPartitions)
    return model.freqItemsets()
Example #5
def get_most():
    print("get most")
    my_spark = pyspark.sql.SparkSession \
        .builder \
        .appName("RESTAPI_most_frequent") \
        .master("local[2]") \
        .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/conception.factures") \
        .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/conception.factures") \
        .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.1')\
        .config("spark.executor.memory", "1G") \
        .config("spark.driver.memory", "5G") \
        .getOrCreate()

    df = my_spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

    df.show()

    transactions = df.groupBy("_id") \
        .agg(functions.collect_list("articles.product_name").alias("name")) \
        .rdd \
        .flatMap(lambda x: x.name)

    transactions.collect()

    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()

    return json.dumps(result)
Example #6
    def _train_fp_growth_model(cls, data_store, eco_to_package_topic_dict,
                               min_support_count, additional_path,
                               fp_num_partition):
        sc = SparkContext()
        manifest_file_list = data_store.list_files(prefix=additional_path +
                                                   MANIFEST_FILEPATH)
        list_of_topic_list = list()
        for manifest_file in manifest_file_list:
            eco_to_package_list_json_array = data_store.read_json_file(
                manifest_file)
            for eco_to_package_list_json in eco_to_package_list_json_array:
                ecosystem = eco_to_package_list_json.get(MANIFEST_ECOSYSTEM)
                list_of_package_list = eco_to_package_list_json.get(
                    MANIFEST_PACKAGE_LIST)
                for package_list in list_of_package_list:
                    package_list_lowercase = [x.lower() for x in package_list]
                    topic_list = cls.get_topic_list_for_package_list(
                        package_list_lowercase, ecosystem,
                        eco_to_package_topic_dict)
                    list_of_topic_list.append(topic_list)
        transactions = sc.parallelize(list_of_topic_list)
        transactions.cache()

        min_support = float(min_support_count / float(transactions.count()))

        model = FPGrowth.train(transactions,
                               minSupport=min_support,
                               numPartitions=fp_num_partition)

        return model
Example #7
def chercher_produits():
    products = df.rdd.map(lambda x: x.produits)
    model = FPGrowth.train(products, minSupport=0.4, numPartitions=5)
    result = list(
        set(model.freqItemsets().flatMap(
            lambda itemset: itemset.items).collect()))
    print("Frequent items: %s" % str(result))
    return jsonify(result)
Example #8
File: FPM.py Project: qqwant/LearningLink
 def getConfident(self):
     f = udf(lambda x: float(len(x)), FloatType())
     rdd = self.df.rdd.flatMap(lambda x: x[0])
     model = FPGrowth.train(rdd, self.support, 2)
     rules = model._java_model.generateAssociationRules(
         self.confidence).collect()
     ls = [[i.javaAntecedent()[0],
            i.javaConsequent()[0],
            i.confidence()] for i in rules if len(i.javaAntecedent()) == 1]
     return spark.createDataFrame(ls, ['l', 'r', 'confidencePositive'])
Example #9
def writeToFile(rdd):
    with open("count.txt", "w") as f:
        f.write(str(rdd.count()))
    rdd_words = rdd.map(lambda line: list(
        filter(lambda a: a != "" and a not in stop_words,
               list(set(line.strip().split(' ')))))).filter(lambda x: x != [])
    model = FPGrowth.train(rdd_words, minSupport=0.02, numPartitions=20)
    result = model.freqItemsets().collect()
    with open("frequent_items.txt", "w") as g:
        for i in range(5):
            g.write(json.dumps(result[i].items) + "\n")
Example #10
    def run_FPM(tweets, collection):
        model = FPGrowth.train(tweets.select("filtered").rdd.map(lambda x: x[0]), minSupport=0.02)
        result = sorted(model.freqItemsets().collect(), reverse=True)
        # sort again by frequency, in descending order
        sorted_result = sorted(result, key=lambda item: int(item.freq), reverse=True)

        # save output to file
        with codecs.open(globals.FP_dir + "/" + time.strftime("%Y%m%d-%H%M%S") + '_'
                                + collection["Id"] + '_'
                                + collection["name"] + '.txt', 'w',encoding='utf-8') as file:
            for item in sorted_result:
                file.write("%s %s\n" % (item.freq, ' '.join(item.items)))
Example #11
def similar_items_for_type(rdd, index, type):
    print("Calculating similar items for type: ", type)
    new_rdd = rdd.map(lambda row: [row[0]]+(row[index] if row[index] else []))
    new_rdd = new_rdd.map(lambda row: list(set(row)))
    model = FPGrowth.train(new_rdd, minSupport=0.001, numPartitions=4)
    freq_items_sets = model.freqItemsets().collect()
    item_to_sim = {}
    for freq_item_set in freq_items_sets:                                                 
        items = set(freq_item_set[0]) 
        for item in items:
            item_to_sim.setdefault(item, set()).update(items.difference(set([item])))
    return item_to_sim
Example #12
def main(sc, argv):
    inputfile = ''
    outputfile = ''
    sigma = 0

    """
    Parse command line option

    * inputfile: csv file containing the transaction data (delimiter is ' ')
    * outputfile: csv file containing frequent item sets of size 3 or more
                  with support greater than sigma
    * sigma: minimum support required (should be positive integer)
    """
    try:
        opts, args = getopt.getopt(argv[1:],
                                   "hi:o:s:", ["ifile=", "ofile=", "sigma="])
    except getopt.GetoptError:
        usage(argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage(argv[0])
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--sigma"):
            sigma = int(arg)

    print 'Input file is ', inputfile
    print 'Output file is ', outputfile
    print 'Support level is ', str(sigma)
    if inputfile == '' or outputfile == '' or sigma <= 0:
        usage(argv[0])
        sys.exit(2)

    data = sc.textFile(inputfile)
    transactions = data.map(lambda line: line.strip().split(' '))
    """Compute minSupport from sigma"""
    minSupport = float((float(sigma))/transactions.count())
    print "MinSupport = " + str(minSupport)
    model = FPGrowth.train(transactions, minSupport, numPartitions=10)
    result = model.freqItemsets().collect()
    with open(outputfile, "w") as opf:
        for items, freq in result:
            if len(items) >= 3:
                """Write only frequent item sets with 3 or more items"""
                opf.write("%d, %d, %s\n" %
                          (len(items), freq, printItemSet(items)))
Example #13
def FpGrowthWithFilterByOverall(dataframe, date, product):

    df = dataframe.select([product, date])
    df = df.na.drop()

    transactions_data = df.groupBy(date).agg(
        F.collect_list(product).alias("transactions")).rdd.map(
            lambda x: x.transactions)
    unique_transactions = transactions_data.map(lambda x: list(set(x))).cache()

    model = FPGrowth.train(unique_transactions, 0.2, 10)
    result = model.freqItemsets().collect()

    return result
Example #14
def main(sc, argv):
    inputfile = ''
    outputfile = ''
    sigma = 0
    """
    Parse command line option

    * inputfile: csv file containing the transaction data (delimiter is ' ')
    * outputfile: csv file containing frequent item sets of size 3 or more
                  with support greater than sigma
    * sigma: minimum support required (should be positive integer)
    """
    try:
        opts, args = getopt.getopt(argv[1:], "hi:o:s:",
                                   ["ifile=", "ofile=", "sigma="])
    except getopt.GetoptError:
        usage(argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            usage(argv[0])
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-s", "--sigma"):
            sigma = int(arg)

    print 'Input file is ', inputfile
    print 'Output file is ', outputfile
    print 'Support level is ', str(sigma)
    if inputfile == '' or outputfile == '' or sigma <= 0:
        usage(argv[0])
        sys.exit(2)

    data = sc.textFile(inputfile)
    transactions = data.map(lambda line: line.strip().split(' '))
    """Compute minSupport from sigma"""
    minSupport = float((float(sigma)) / transactions.count())
    print "MinSupport = " + str(minSupport)
    model = FPGrowth.train(transactions, minSupport, numPartitions=10)
    result = model.freqItemsets().collect()
    with open(outputfile, "w") as opf:
        for items, freq in result:
            if len(items) >= 3:
                """Write only frequent item sets with 3 or more items"""
                opf.write("%d, %d, %s\n" %
                          (len(items), freq, printItemSet(items)))
Example #15
    def generate_freq_items(self):
        """Run Spark FP Growth to get the frequent item sets.

        :return: Only those frequent items sets where len(item_set) is either 4 or 5
        """
        sc = SparkContext.getOrCreate()
        rdd = sc.parallelize(self.all_list_of_package_list, NUM_PARTITIONS)
        model = FPGrowth.train(rdd, MIN_SUPPORT_COUNT, NUM_PARTITIONS)
        freq_item_sets = model.freqItemsets().collect()
        for item_set in freq_item_sets:
            item_set_len = len(item_set.items)
            if item_set_len == 4:
                self.freq_items_4.append(item_set.items)
            elif item_set_len == 5:
                self.freq_items_5.append(item_set.items)
        sc.stop()
Example #16
def frequent_word_comb(parsed):
    # Create the transaction-like dataset for FP-Growth
    transactions = parsed.map(lambda line: split_tweet(line, True))

    # Run the FPGrowth algorithm with the specified minimum support and number of partitions
    model = FPGrowth.train(transactions, minSupport=0.01, numPartitions=4)
    result = model.freqItemsets().collect()
    # Select only frequent itemsets that have 2 or more elements.
    pair_results = [itemset for itemset in result if len(itemset[0]) >= 2]
    # Sort in the descending order of frequency
    pair_results.sort(reverse=True, key=custom_sort)
    f = open('outputfile.txt', "a+")
    for pair in pair_results[:10]:
        f.write("Frequent Itemset: " + str(pair[0]) + "\tFrequency: " +
                str(pair[1]) + "\n")
    f.close()
Example #17
    def gp_growth_demo(self):

        '''
        r z h k p
        z y x w v u t s
        s x o n r
        x z y m t s q e
        z
        x z y r q t p
        :return:
        '''
        data = self.sc.textFile(self.base + 'sample_fpgrowth.txt')
        transactions = data.map(lambda line: line.strip().split(' '))
        model = FPGrowth.train(transactions, minSupport=0.3, numPartitions=10)
        result = model.freqItemsets().collect()
        for fi in result:
            print(fi)
Example #18
def FpGrowthWithFilterByCity(dataframe, date, product, city):

    df = dataframe.select([product, date, city])

    df = df.withColumn("_Products_",
                       F.concat(F.col(product), F.lit(","), F.col(city)))
    df = df.na.drop()

    transactions_data = df.groupBy(date).agg(
        F.collect_list("_Products_").alias("transactions")).rdd.map(
            lambda x: x.transactions)
    unique_transactions = transactions_data.map(lambda x: list(set(x))).cache()

    model = FPGrowth.train(unique_transactions, 0.2, 10)
    result = model.freqItemsets().collect()

    return result
Example #19
    def create_model_text(self, data, params):

        minSupport = float(params.get('minSupport', 0.2))
        numPartitions = int(params.get('numPartitions', 10))
        limits = int(params.get('limits', 10))

        transactions = data.map(lambda line: line.strip().split(' '))

        model = FPGrowth.train(transactions,
                               minSupport=minSupport,
                               numPartitions=numPartitions)

        result = model.freqItemsets().collect()

        for index, fi in enumerate(result):
            if index == limits:
                break
            print(str(fi.items) + ':' + str(fi.freq))
Example #20
def main():
    itemsets_path1 = "./itemsets.csv"
    min_supp = 0.0002
    itemsets = sc.textFile(
        itemsets_path1).map(lambda line: line.strip().split('\t')).map(
            lambda x: list(set(x)))  # items must be unique
    model = FPGrowth.train(itemsets, minSupport=min_supp)  # freq=50
    result = model.freqItemsets().collect()

    with open('./freq.csv', 'w') as fout:
        with open('./frqitems.csv', 'w') as fout2:
            for freqitemset in result:
                it = freqitemset.items  # list
                freq = freqitemset.freq
                for CUI in it:
                    fout.write(CUI + '\t')
                fout.write("\n")
                fout2.write(str(freq))
                fout2.write('\n')
Example #21
    def fit(self, rdd, min_support):
        '''
        Mine frequent itemsets using `pyspark.mllib.fpm.FPGrowth`.

        param:
            `rdd`: PythonRDD, transactions
            `min_support`: float in [0, 1) or int in [1, inf]. If the former, it is a fraction of
                records; if the latter, a number of records. The number of partitions is taken
                from `rdd.getNumPartitions()`.
        '''
        self.rdd = rdd
        self.n = rdd.count()

        # Allow for passing "number of records" or percentage
        if min_support >= 1:
            min_support /= self.n

        model = FPGrowth.train(rdd, min_support, rdd.getNumPartitions())
        self.itemsets_df = model.freqItemsets().toDF()
        self._addl_itemset_setup()
Example #22
    def fpgrowth(self):
        '''
        Frequent merchant-category itemsets mined from user spending
        :return:
        '''

        tran_df = self.spark.load_from_mysql('t_CMMS_CREDIT_TRAN').filter("BILL_AMTFLAG = '+'").select('ACCTNBR',
                                                                                                 'MER_CAT_CD') \
            .filter("MER_CAT_CD != 0").filter("MER_CAT_CD != 6013")

        result = tran_df.map(lambda x: (str(int(x['ACCTNBR'])), [str(int(x['MER_CAT_CD'])), ])).groupByKey()

        def m(x):
            k = x[0]
            l = list(x[1])

            v = set()
            for i in l:
                v.add(i[0])

            return set(v)

        result = result.map(m)
        for i in result.take(10):
            print(i)

        model = FPGrowth.train(result, minSupport=0.05, numPartitions=10)
        result = model.freqItemsets().collect()
        single=[]
        many=[]

        for r in result:
            if len(r[0]) == 1:
                single.append(r)
            else:
                many.append(r)

        for i in single:
            print(i[0])

        for i in many:
            print(i[0])
Example #23
def alg_fp_growth(data_set_rdd, threshold, num_of_partitions):
    start = time.time()
    model = FPGrowth.train(data_set_rdd, threshold, num_of_partitions)
    end = time.time()
    itemsets_calculation_time = end - start
    print 'Training took %s seconds' % itemsets_calculation_time
    start = time.time()
    result = model.freqItemsets().collect()
    result = {
        str(sorted(list(set(n.items)))): (set(n.items), n.freq)
        for n in result
    }
    result_copy = {k: v for k, v in result.iteritems()}
    res = {}
    for k, v in result.iteritems():
        if isCis(result_copy, v):
            res[k] = v
    end = time.time()
    collect_and_filter_time = end - start
    print 'Frequent itemsets collection and cis filter took %s seconds' % collect_and_filter_time
    return res, itemsets_calculation_time, collect_and_filter_time
Example #24
def process_batch(df, epoch_id, topic_name):
    """
    Counts tweets in a batch, runs FPGrowth, and saves output to text file
    """
    # Open file in outputs folder in append mode
    file = open(f'outputs/{topic_name}', 'a')

    now = datetime.now()
    current_time = now.strftime("%d/%m/%y %H:%M:%S")
    file.write(f"Time: {current_time}\n")

    log.info(f"Custom Batch process for {topic_name}")
    log.debug(df.collect())
    log.info(f"Current Time: {current_time}, Epoch ID: {epoch_id}")

    tweet_count = df.count()
    log.info(f"Total tweets in batch: {tweet_count}")
    file.write(f"Total tweets in batch: {tweet_count}\n")

    if tweet_count > 3:
        log.info("Running FPGrowth")
        file.write("Frequent Itemsets:\n")
        # Remove duplicate entries in a row
        transactions = df.rdd.map(lambda line: line.value.split(" "))
        unique = transactions.map(lambda x: list(set(x)))

        model = FPGrowth.train(unique, minSupport=0.3)
        # Sort items based on frequency
        result = sorted(model.freqItemsets().collect(),
                        reverse=True,
                        key=lambda x: x[1])
        for fi in result:
            log.debug(fi)
            file.write(f'{fi}\n')
    else:
        file.write('Not running FPGrowth due to low no. of tweets\n')

    file.write('\n\n')
    file.close()
Example #25
    def fpgrowth(self):
        '''
        Frequent itemset mining:

        1. get the group of similar customers
        2. list the products these customers use
        3. run FP-Growth on those product lists
        :return:
        '''

        data = [
            ['1', '1', '2'],
            ['2', '1', '1', '2'],
            ['P1', 'P3'],
            ['P3', 'P5', 'P4', 'P6'],
            ['P4', 'P5']
        ]
        rdd = self.sc.parallelize(data, 2).cache()
        model = FPGrowth.train(rdd, minSupport=0.3, numPartitions=10)
        result = model.freqItemsets().collect()
        for r in result:
            print(r)
Example #26
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter(
        "description is not NULL").cache()
    # df_jobs = spark.read.json("newjobs4rdd/newjobs.jsonl").filter("description is not NULL").cache()
    tokenizer = Tokenizer(inputCol="description", outputCol="words")
    tokenized = tokenizer.transform(df_jobs)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    words = removed.select("filtered").rdd.map(lambda x: list(
        set(blacklist(lemmatize(strip_punctuation(x.filtered))))))
    model = FPGrowth.train(words, minSupport=0.1, numPartitions=1)
    finalDF = model.freqItemsets().map(
        lambda row: (' '.join(row.items), row.freq)).toDF(
            ["items", "freq"]).orderBy(desc("freq")).coalesce(1)
    finalDF.write.csv('fpjobs-newjobs-with-blacklist')
Example #27
def getRulesByFPGrowth(FILENAME, iter1, iter2, classes, min_sup=0.1, min_conf=0.0, numPartitions=32, ratio=True) :
    
    # read data
    filepath = DIR_UCI+'/'+FILENAME+'/alpha/'+FILENAME+'-train'+str(iter1)+'-'+str(iter2)+'.txt'
    data = sc.textFile(filepath)
    print(filepath)
    transactions = data.map(lambda line: line.strip().split(' '))

    # determine the minimum support
    nrow = sum(1 for line in open(filepath))
    minSupport = float(min_sup) if ratio else float(min_sup) / float(nrow)

    # define the model
    model = FPGrowth.train(transactions, minSupport=minSupport, numPartitions=numPartitions)

    # keep only the frequent itemsets that contain no class label
    nocls_freq_item_sets = model.freqItemsets().filter(lambda fis: all(x not in fis.items for x in classes))
    # keep the frequent itemsets that contain a class label and have two or more items
    cls_freq_item_sets = model.freqItemsets().filter(lambda fis: any(x in fis.items for x in classes)).filter(lambda fis: len(fis.items) > 1).collect()
    rules = []
    
    #def getRule(cls_freq_item):
        # find the itemset whose non-class items are the same and whose length differs by one
    #    cls_freq_item = cls_freq_item.first()
    #    nocls_freq_item = nocls_freq_item_sets.filter(lambda ifs : all(x in cls_freq_item.items for x in ifs.items)).filter(lambda fis: len(fis.items) == len(cls_freq_item.items) - 1).first()
        #print(cls_freq_item)
        #print(nocls_freq_item) 
    #    conf = float(cls_freq_item.freq) / float(nocls_freq_item.freq)
    #    if conf >= min_conf:
    #        rule = Rule()
    #        rule.setValue(nocls_freq_item.items)
    #        cls = list(set(cls_freq_item.items) & set(nocls_freq_item.items))[0]
    #        rule.setConsequent(cls)
    #        rule.setSupport(cls_freq_item.freq)
    #        rule.setConf(conf)
    #        return(rule)
    #    else :
    #        return(None)
#
    #rules = cls_freq_item_sets.foreach(getRule)

    rules = []
    print("item count :"+str(len(cls_freq_item_sets)))
    for cls_freq_item in cls_freq_item_sets:
        
        # find the itemset whose non-class items are the same and whose length differs by one
    #    nocls_freq_item = nocls_freq_item_sets.filter(lambda ifs : all(x in cls_freq_item.items for x in ifs.items)).filter(lambda fis: len(fis.items) == len(cls_freq_item.items) - 1).first()
   
        #print(cls_freq_item)
    #    print(nocls_freq_item) 
        #for nocls_freq_item in nocls_freq_item_sets:
        #    # find the itemset whose non-class items are the same and whose length differs by one
        #    cls_freq_item = cls_freq_item_sets.filter(lambda fis: (all(x in fis.items for x in nocls_freq_item.items))).filter(lambda fis: len(fis.items) == len(nocls_freq_item.items) + 1).collect()
        #    if cls_freq_item:
    #    conf = float(cls_freq_item.freq) / float(nocls_freq_item.freq)
    #    if conf >= min_conf:
        values = [x for x in cls_freq_item.items if not x in classes]
        cls = [x for x in cls_freq_item.items if x in classes][0]
        conf = 0.0
        rule = Rule()
        rule.setValue(values)
        #cls = list(set(cls_freq_item.items) & set(nocls_freq_item.items))[0]
        rule.setConsequent(cls)
        rule.setStrength(cls_freq_item.freq)
        rule.setConf(conf)
        rules.append(rule)

    return(rules)
Example #28
import time
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.mllib.fpm import FPGrowth, PrefixSpan

# sc is an existing SparkContext.
sqlContext = HiveContext(sc)

# load i2b2 data
data = sc.textFile("/Users/jayurbain/Dropbox/machine-learning/machine-learning/data/sample_fpgrowth.txt")

print data.take(10)

# fpgrowth example
transactions = data.map(lambda line: line.strip().split(' '))
print transactions.take(5)
model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
result = model.freqItemsets().collect()
for fi in result:
    print(fi)

for i in result:
    print '(', ', '.join(i.items), ')', 'freq=', str(i.freq)

#############################################

data = [["a", "b", "c"], ["a", "b", "d", "e"], ["a", "c", "e"], ["a", "c", "f"]]
rdd = sc.parallelize(data, 2)
model = FPGrowth.train(rdd, 0.6, 2)
sorted(model.freqItemsets().collect())

####################################################
Example #29
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth

sc = SparkContext()

nums = set()
reader = open(
    '/home/edu/songsong/mif_2019/fraud_detection/output/dataOfYZJPTC.csv')
for num in reader:
    num = num.strip("\n").split(',')
    nums.add(num[0])

data = sc.textFile("/mif/data_new/worker_hospital_detail.txt")
data = data.map(lambda line: line.split(','))
# num 0 ,medical_name 2 ,count 4
data_ngs = data.filter(lambda line: line[0] in nums and len(line) > 4)
#basket
data_bkt_withNum = data_ngs.map(lambda line: ((line[0], line[2]), 1)) \
    .reduceByKey(lambda a, b: a) \
    .map(lambda (k, v): (k[0], [k[1]])) \
    .reduceByKey(lambda a, b: a + b)

data_bkt = data_bkt_withNum.map(lambda (k, v): v)
data_bkt.cache()
model = FPGrowth.train(data_bkt, 0.01)
fitems = model.freqItemsets().collect()
out = open('output/fpm_yzjptc.txt', 'w')
for itemset in fitems:
    line = reduce(lambda a, b: "%s\t%s" % (a, b),
                  itemset.items).encode("utf-8")
    out.write("%d\t%s\n" % (itemset.freq, line))
out.close()
Example #30
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# $example on$
from pyspark.mllib.fpm import FPGrowth
# $example off$
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="FPGrowth")

    # $example on$
    data = sc.textFile("data/mllib/sample_fpgrowth.txt")
    transactions = data.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(transactions, minSupport=0.2, numPartitions=10)
    result = model.freqItemsets().collect()

    for fi in result:
        print(fi)
    # $example off$

    minConfidence = 0.8
    associations = model.generateAssociationRules(minConfidence).collect()

    for ar in associations:
        print(ar)
Example #31
from pyspark import SparkContext 
from pyspark import SparkConf
from pyspark.mllib.fpm import FPGrowth
import sys, operator
import re, string



inputs = sys.argv[1]
output = sys.argv[2]

conf = SparkConf().setAppName('frequent itemsets')
sc = SparkContext(conf=conf)

text = sc.textFile(inputs)

transactions = text.map(lambda line: map(int,line.split()))

model = FPGrowth.train(transactions, 0.0002).freqItemsets().map(lambda (w,z):(sorted(w),z))

modelsort=model.sortBy(lambda (w,c): (-c,w)).map(lambda (w,c): u"%s %i" % (w, c)).take(10000)

modelsort1=sc.parallelize(modelsort,1)

modelsort1.saveAsTextFile(output)
Example #32
# -*- coding:utf-8 -*-
"""
Program: FPGrowth
Description: example of calling Spark's built-in FP-Growth algorithm
Author: zhenglei - [email protected]
Date: 2016-01-14 13:36:09
Last modified: 2016-01-14 13:37:01
Python release: 2.7
"""

# Call Spark's built-in FP-Growth algorithm to reproduce the Chapter 12 example
# from "Machine Learning in Action"
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth

if __name__ == '__main__':
    sc = SparkContext()
    tmpdatas = sc.textFile('kosarak.dat')
    datas = tmpdatas.map(lambda line: line.strip().split(' '))
    # tmpdatas = sc.textFile('/opt/spark-1.6.0/data/mllib/sample_fpgrowth.txt')
    # datas = tmpdatas.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(datas, minSupport=0.1)
    results = model.freqItemsets().collect()
    for item in results:
        print item
    sc.stop()
Example #34
.master("local")\
.appName("RDD_and_DataFrame") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

spark.conf.set("spark.sql.execution.arrow.enabled", "true")
#run the SQL statement and organize the order data into individual transactions
sentenceData = spark.sql("读取数据")
#convert the DataFrame into an RDD so it can be mapped over
rdd=sentenceData.rdd
#convert the data into the form [[itemid,itemid,...],[itemid,itemid,...],...]; each inner list is one transaction, i.e. the items bought in the same order
rdd=rdd.map(lambda x: x[1].split(",")).collect()
#convert back into the RDD format the model expects; newer Spark versions may change the model API
rdd=spark.sparkContext.parallelize(rdd)
#the remaining arguments are the minimum support and the number of partitions (2) the input RDD is split into
model = FPGrowth.train(rdd, 0.00001, 2)
skuresult=model.freqItemsets().collect()
#skuresult holds the frequent itemsets mined from the orders



2. Association rule mining (frequent itemsets)
From each frequent itemset, find the corresponding strong association rules:
take a k-item frequent itemset and extract association rules from it (one-to-one, one-to-many, many-to-many).
Suppose the k-item frequent itemset is {1, 2, 3}.
Steps (a sketch follows below):
step 1: enumerate all non-empty proper subsets {1}, {2}, {3}, {1,2}, {2,3}, {1,3}
step 2: pick any two disjoint non-empty subsets to form a rule, e.g. {1} => {2,3}
step 3: check whether the rule is a strong association rule; if it is, store it in a rule table (a dict works well) so it can later be matched for recommendations
step 4: repeat steps 2 and 3
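The subset-pairing procedure above can be sketched in plain Python over the (items, freq) pairs returned by model.freqItemsets().collect() (for example skuresult above). This is only an illustrative sketch, not the library's implementation; the helper name strong_rules and the min_conf threshold are invented for the example. Newer Spark versions expose the same idea directly via model.generateAssociationRules(minConfidence), as shown in Example #30.

from itertools import combinations

def strong_rules(freq_itemsets, min_conf=0.6):
    """freq_itemsets: list of (items, freq) pairs from FPGrowth freqItemsets().collect().
    Returns {(antecedent, consequent): confidence} for every strong rule found."""
    # step 0: support lookup, frozenset of items -> frequency
    support = {frozenset(items): freq for items, freq in freq_itemsets}
    rules = {}
    for items, freq in freq_itemsets:
        itemset = frozenset(items)
        if len(itemset) < 2:
            continue
        # steps 1-2: every non-empty proper subset is a candidate antecedent,
        # and its complement inside the itemset is the consequent
        for r in range(1, len(itemset)):
            for antecedent in map(frozenset, combinations(itemset, r)):
                consequent = itemset - antecedent
                base = support.get(antecedent)
                if not base:
                    continue
                # step 3: keep the rule only if its confidence clears the threshold
                conf = float(freq) / base
                if conf >= min_conf:
                    rules[(tuple(sorted(antecedent)), tuple(sorted(consequent)))] = conf
    return rules

# e.g. rules = strong_rules([(fi.items, fi.freq) for fi in skuresult], min_conf=0.6)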
Example #35
def stopwords_remover(text_list):
        res = []
        for word in text_list:
                if len(word)>2:
                        if word not in stopwords:
                                res.append(word)
        return res

#keeps only one word's occurrence for basket
def duplicate_remover(text_list):
        return dict.fromkeys(text_list).keys()

#find the text field inside the tweet's raw data and clean it
def parse_text(tweet):
        #search text
        res = re.search('\"text\" : "(.*)" , \"in_reply_to_status_id\"', tweet).group(1)
        #remove bad characters
        res = text_cleaner(res)
        #remove words that appear more than once and next remove stopwords
        return stopwords_remover(duplicate_remover(res.split()))


sc = SparkContext(appName="testFPGrowth")
stopwords = open('/home/e01/stopwords.txt','r').read().splitlines()
rdd_tweets = sc.textFile(large_file).sample(False, sample_size, 42).map(lambda tweet: parse_text(tweet))

model = FPGrowth.train(rdd_tweets, minSupport=0.02, numPartitions=1000)
result = model.freqItemsets().collect()
for fi in result:
    print(fi)
Example #36
File: 1.py Project: mehrdadalmasi1/third

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.mllib.fpm import FPGrowth

    conf = SparkConf()

    conf.setMaster("local").setAppName('FPgrowth-notebook').set(
        "spark.executor.memory", "50g")

    sc = SparkContext(conf=conf)
    data = sc.textFile("/home/mehrdad/Downloads/Text.csv", 10)
    transactions = data.map(lambda line: line.strip().split(' '))
    model = FPGrowth.train(transactions, minSupport=0.0001, numPartitions=10)
    result = model.freqItemsets().collect()
    f2 = open("/home/mehrdad/Downloads/RulesOfBank.txt", 'w')
    for fi in result:
        print(fi)
        f2.write(str(fi) + "\n")
    f2.close()
    sc.stop()
except ImportError as e:
    print("Can not import Spark Modules", e)
Example #37
    from_addr = '*****@*****.**',
    to_addr = '*****@*****.**',
    subject = 'subject',
    text_body = 'sdfasf')
envelope.send('172.16.8.28')
#envelope.send('172.16.8.28', login='******',password='******', tls=True)



from clockwork import clockwork
api = clockwork.API('2bba3f5cb100cb0b3e1085c6b546a1ffe2f2cec8')
message = clockwork.SMS(to = '9910055945', message = 'This is a test message.')
response = api.send(message)


----------------------------
from pyspark.mllib.fpm import FPGrowth
from pyspark.mllib.evaluation import RankingMetrics
from pyspark import SparkContext
#sc = SparkContext(appName='aslkjsdf')
data = sc.textFile('/spark-data/input/eval_100000')
header = data.first()
data1 = data.filter( lambda x: x != header)
data = data1.map(lambda x: x.split(",")).map( lambda x: tuple([x[2],x[0]])).distinct()
data = data.groupByKey().mapValues(list).map(lambda x : x[1])
data.saveAsTextFile('/spark-data/testing/formatted_data')
train,test = data.randomSplit([7, 3], 0)
model = FPGrowth.train(data, minSupport=0.01)
result = model.freqItemsets()
result.saveAsTextFile('/spark-data/testing/ouput/complete')
Example #38
from pyspark import SparkContext, SparkConf
from pyspark.mllib.fpm import FPGrowth
import sys, operator


inputs = sys.argv[1] #input
output = sys.argv[2] #output

conf = SparkConf().setAppName('frequent itemsets')
sc = SparkContext(conf=conf)

text = sc.textFile(inputs)

""" sbaronia - taking itemsets in int form and splitting then
of spaces, else ' ' becomes an itemset
"""
items = text.map(lambda line: map(int, line.strip().split(' ')))

""" sbaronia - calling FPGrowth function with support
0.0022 and partition 1, will give more than 10k frequent itemsets
"""
model = FPGrowth.train(items, 0.0022, 1)
fitems = model.freqItemsets()

""" sbaronia - here we sort every transaction in ascending order and 
then the entire 10k by descending order of frequencies and make 
and rdd from list of 10k items
"""
sort_transactions = sc.parallelize(fitems.map(lambda (i,c): (sorted(i), c)).sortBy(lambda (i,c): (-c,i)).take(10000))

sort_transactions.saveAsTextFile(output)
Example #39
# This program trains and fits a FPGrowth model using RDD for finding frequent
# patterns from the data

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.fpm import FPGrowth

sc = SparkContext.getOrCreate(SparkConf())
#data = sc.textFile("hdfs://worker2.hdp-internal:8020/user/ketkid1/calls.txt")
data = sc.textFile(
    "hdfs://worker2.hdp-internal:8020/user/ketkid1/calldata.txt")

# remove the empty lines present in RDD
data = data.filter(lambda line: line not in '')

# split each line on comma
calls = data.map(lambda line: line.strip().split(','))

# remove duplicates if any and cache the input data
unique = calls.map(lambda x: list(set(x))).cache()

# train the FP-Growth model on the de-duplicated transactions
model = FPGrowth.train(unique, minSupport=0.02, numPartitions=2)
result = model.freqItemsets().collect()

# print the result
for i in result:
    print(i)
Example #40
def FPGrowthRDD(transactionsRDD, minSupport=0.2, numPartitions=10):
    '''
    perform the FPGrowth algorithm
    '''
    model = FPGrowth.train(transactionsRDD, minSupport, numPartitions)
    return model.freqItemsets()
Example #41
# print(data.take(num_obs),'\n')
data = data.map(lambda x: x[1])
# print(data.take(num_obs),'\n')
data = data.map(lambda line:re.sub('[^A-Za-z0-9]+', ',', line))
print(data.take(num_obs),'\n')
data = data.map(lambda line:re.sub(num_rep, 'NUM', line))
print(data.take(num_obs),'\n')
data = data.map(lambda line: line.lower().strip().split(',')[:3])
print(data.take(num_obs),'\n')
data = data.map(lambda line: [elem+'_P'+str(idx+1) for (elem,idx) in zip(line,range(len(line)))])
print(data.take(num_obs),'\n')


# In[53]:

datamodel = FPGrowth.train(data,minSupport=0.001,numPartitions=10)


# In[9]:

# data.saveAsTextFile(processed_data_out)


# In[10]:

#df=data.toDF()
# data.take(100)
#data.getNumPartitions()


# In[15]:
Example #42
sc = SparkContext(conf=conf)

text = sc.textFile(inputs1)


def split_items(ts):
    items_list = []
    for transaction in ts:
        items = transaction.split()
        int_items = [int(i) for i in items]
        items_list.append(int_items)
    return items_list


transactions = text.collect()
transaction_list = split_items(transactions)
rdd = sc.parallelize(transaction_list, 6)
model = FPGrowth.train(rdd, minSupport=0.002, numPartitions=10)
frequent_sets = model.freqItemsets().collect()

frequent_tuples = sc.parallelize(frequent_sets).map(lambda (items, freq): (sorted(items), freq)).coalesce(1).collect()
frequent_tuples.sort(key = lambda r: r[0])
frequent_tuples.sort(key = lambda r: r[1], reverse = True)

top10k = sc.parallelize(frequent_tuples).take(10000)
output_data = sc.parallelize(top10k).coalesce(1)

output_data.saveAsTextFile(output)


Example #43
conf = SparkConf().setAppName("itemsets")
sc = SparkContext(conf=conf)

text = sc.textFile(inputs1)


def split_items(ts):
    items_list = []
    for transaction in ts:
        items = transaction.split()
        int_items = [int(i) for i in items]
        items_list.append(int_items)
    return items_list


transactions = text.collect()
transaction_list = split_items(transactions)
rdd = sc.parallelize(transaction_list, 6)
model = FPGrowth.train(rdd, minSupport=0.002, numPartitions=10)
frequent_sets = model.freqItemsets().collect()

frequent_tuples = sc.parallelize(frequent_sets).map(
    lambda (items, freq): (sorted(items), freq)).coalesce(1).collect()
frequent_tuples.sort(key=lambda r: r[0])
frequent_tuples.sort(key=lambda r: r[1], reverse=True)

top10k = sc.parallelize(frequent_tuples).take(10000)
output_data = sc.parallelize(top10k).coalesce(1)

output_data.saveAsTextFile(output)
Example #44
            f.write(output)
    cmd_put = "hadoop fs -put " + outputdir + " /test"
    cmd_rm = "hadoop fs -rm /test/" + outputdir.split("/")[-1]
    (iRet, RetInfo) = commands.getstatusoutput(cmd_put)
    if iRet != 0:
        commands.getstatusoutput(cmd_rm)
        commands.getstatusoutput(cmd_put)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: fpmining_spark <infile> miniSupport(0,1) <outfile>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonFpMining")
    # input pre-process
    data = sc.textFile(sys.argv[1], 1)
    fSupport = float(sys.argv[2])
    outDir = sys.argv[3]

    # data pre-process
    transactions = data.map(lambda line: list(set(line.strip().split(","))))
    new_trans = transactions.filter(lambda t: len(t) > 1)

    # FPGrowth
    model = FPGrowth.train(new_trans, minSupport=fSupport, numPartitions=10)
    result = model.freqItemsets().collect()
    # for fi in result:
    #    print(fi.items, fi.freq)
    sc.stop()
    GetFreqItems(result, outDir)
Example #45
trans = transactions.select("items")
trans = trans.collect()
a = [(item) for sublist in trans for item in sublist]
a = sc.parallelize(a)
#model = FPGrowth.train(a, minSupport=0.2, numPartitions=10)
#result = model.freqItemsets().collect()

#for fi in result:
#    print(fi)

from pyspark.ml.fpm import FPGrowth

#model = FPGrowth.train(transactions, minSupport=0.3, numPartitions=5)
#result = model.freqItemsets().collect()

# fit the DataFrame-based FPGrowth estimator on the "items" column
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.6)
model = fpGrowth.fit(transactions)

model.associationRules.show(1)
model.freqItemsets.show(1)

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
#model.transform(transactions).show(10)

#import pyspark.ml.stat
#icecream = orders.join(order_products, orders.order_id == order_products.order_id)
#icecream = icecream.select("order_hour_of_day","product_id")
#icecream = icecream.join(products, icecream.product_id == products.product_id)
#icecream = icecream.select("order_hour_of_day","product_name").show()
##icecream = icecream.filter(icecream.product_name=="Ice cream").show()
#print(DescriptionGrp.rdd.take(2))
minSupport = 0.05 * DescriptionGrp.rdd.count()
apr_tem = DescriptionGrp.rdd.map(lambda x: (x[0], list([x[1]]))).reduceByKey(
    lambda x, y: x + y)
schema = StructType([
    StructField("id", StringType(), True),
    StructField("items", ArrayType(StringType()), True)
])
transactions = spark.createDataFrame(apr_tem, schema)
print(transactions.show(2))
##transactions_fp=apr_tem.map(lambda x: (x[1]))
#print(transactions_fp.take(2))
#schema = StructType([StructField("test_123",ArrayType(StringType(),True),True)])
#fields = [StructField(field_name, StringType(), True) for field_name in schema.split(',')]
#schema = StructType(fields)
##final_transactions_rdd = sc.parallelize(transactions_fp.collect())
##final_transactions = final_transactions_rdd.map(lambda x : ','.join(x))
##print(final_transactions.take(2))
#transactions = spark.createDataFrame([final_transactions])
##transactions = final_transactions.map(lambda line: line.strip().split(','))
##print(transactions.take(2))
fpgrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
##fpgrowth = FPGrowth(minSupport=0.5, minConfidence=0.6)
model = fpgrowth.fit(transactions)
# Display frequent itemsets.
model.freqItemsets.show()
# Display generated association rules.
model.associationRules.show()
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(transactions).show()
Example #47
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.fpm import FPGrowth
import sys, operator
import re, string

inputs = sys.argv[1]
output = sys.argv[2]

conf = SparkConf().setAppName('frequent itemsets')
sc = SparkContext(conf=conf)

text = sc.textFile(inputs)

transactions = text.map(lambda line: map(int, line.split()))

model = FPGrowth.train(
    transactions, 0.0002).freqItemsets().map(lambda (w, z): (sorted(w), z))

modelsort = model.sortBy(lambda (w, c): (-c, w)).map(lambda (w, c): u"%s %i" %
                                                     (w, c)).take(10000)

modelsort1 = sc.parallelize(modelsort, 1)

modelsort1.saveAsTextFile(output)
Example #48
print splitted_explanation.take(5)


# In[12]:

distincted_set = splitted_explanation.map(lambda line: distinct_set(line))
distincted_set.take(5)


# In[13]:

distinctedlist = distincted_set.map(lambda line: distinct_list(line))
distinctedlist.take(5)


# In[14]:

from pyspark.mllib.fpm import FPGrowth
model = FPGrowth.train(distinctedlist, minSupport=0.001, numPartitions=1000)

result = model.freqItemsets().collect()

for fi in result:
    print(fi)


# In[ ]: