def main():
    args = parse_args()

    combined_aps = parse_grouped_aps(args.g)

    with open(args.a, 'r') as infile:
        aps = [line.rstrip() for line in infile]

    data = parse_data_files(aps, args.d)
    print "Num records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="NetflixProblemApp")

    result = (sc.parallelize(data, 128)
              .map(mapper0).reduceByKey(reducer)
              .flatMap(mapper1).reduceByKey(reducer)
              .filter(lambda x: x[0] not in combined_aps)
              .flatMap(mapper3).reduceByKey(reducer)
              .sortBy(lambda x: len(x[1]))
              .collect())

    sc.stop()

    for record in result:
        date_elem, ap_conns = record
        output = [x for x in ap_conns if len(x[1]) > NUM_JUMPS_THRESHOLD]
        day = str(date_elem.day)
        month = str(date_elem.month)
        year = str(date_elem.year)
        with open(args.o + month + '-' + day + '-' + year[2:], 'w') as outfile:
            json.dump(output, outfile, indent=4)
Example #2
def main():
    args = parse_args()

    raw_zones, aps = parse_grouped_aps(args.g)
    zones = {}
    for zone in raw_zones:
        for ap in zone['aps']:
            zones[ap] = str(zone['zone'])

    data = parse_data_files(aps, args.d, zones)
    print "Num records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="NetflixProblemApp")

    result = (sc.parallelize(data, 128)
              .map(mapper0).reduceByKey(reducer)
              .flatMap(mapper1).reduceByKey(reducer)
              .flatMap(mapper3).reduceByKey(reducer)
              .sortBy(lambda x: len(x[1]))
              .collect())

    sc.stop()

    for record in result:
        date_elem, ap_conns = record
        output = [x for x in ap_conns if len(x[1]) > NUM_JUMPS_THRESHOLD]
        day = str(date_elem.day)
        month = str(date_elem.month)
        year = str(date_elem.year)
        with open(args.o + month + '-' + day + '-' + year[2:] + '.json', 'w') as outfile:
            json.dump(output, outfile, indent=4)
Example #3
def main():
    ratings = read_netflix_ratings()
    map_reducer = mapreduce()
    pipeline = map_reducer.parallelize(ratings, 128)

    similar_table = pipeline.map(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer) \
        .flatMap(mapper3) \
        .reduceByKey(reducer) \
        .flatMap(mapper4) \
        .reduceByKey(reducer) \
        .flatMap(mapper5)
    recommend_result = []
    print('******************************* Recommendation results ***********************************')
    for item in similar_table.collect():
        recommend_result.append(item)
        print(item)
    print('*********************************** Task 6 *****************************************')
    df = task_6(recommend_result)
    return df
Example #4
def main():
    # Get the ratings from ratings.csv
    ratings = read_netflix_ratings()

    # Initialize MapReduce
    map_reducer = mapreduce()
    pipeline = map_reducer.parallelize(ratings, 128)

    # DO NOT MODIFY THIS!
    similar_table = pipeline.map(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer) \
        .flatMap(mapper3) \
        .reduceByKey(reducer) \
        .flatMap(mapper4) \
        .reduceByKey(reducer) \
        .flatMap(mapper5)
    recommend_result = []
    print('******************************* Recommendation results ***********************************')
    for item in similar_table.collect():
        recommend_result.append(item)
        print(item)
    print('*********************************** Task 6 *****************************************')
    df = task_6(recommend_result)
    df2 = task_6_2(recommend_result)
    display(df2)
    return df
Example #5
 def get(self):
     # Turkish identifiers: il_ve_sayi = "province and count" (the mapreduce result),
     # kullanici = user, kullanici_adi = username, kullanicilar = users
     import mapreduce
     il_ve_sayi = mapreduce.mapreduce()
     cc = self.db.kullanici.find()
     if self.current_user:
         self.render("index.html", kullanici_adi=self.get_user_name(), kullanicilar=cc, ilsayi=il_ve_sayi)
     else:
         self.render("index.html", kullanici_adi="", kullanicilar=cc, ilsayi=il_ve_sayi)
Example #6
def main():

    with open('condensedStats.csv', 'r') as f:
        data = [line.rstrip('\n').split(',') for line in f]

        sc = mapreduce()
        result = sc.parallelize(data[1:], 128) \
                            .map(mapper1) \
                            .reduceByKey(reducer) \
                            .sortByKey(True) \
                            .collect()

        sc.stop()

        topVids = result[len(result)-51:]
        l = []

        for vid in topVids:
            l.extend(vid[1][3].lower().split(';'))

        counter = collections.Counter(l)

        with open('mostCommonTags.csv', 'w', newline='') as c:
            writer = csv.writer(c)
            writer.writerow(['Tag', 'Count'])

            for key,count in counter.most_common():
                writer.writerow([key, count])

        with open('commentsFile.csv', 'w', newline='') as c:
            writer = csv.writer(c)
            writer.writerow(['Id', 'Title', 'Description', 'Comments (; delimited list)'])

            regex = re.compile('[%s]' % re.escape(string.punctuation))

            for vid in topVids:
                try:
                    comments = json.load(urllib.request.urlopen(url + vid[1][0] + '&key=' + api_key))
                except Exception as e:
                    print(e)
                    print(vid[1][0])
                    continue

                commentList = ''

                if comments['items']:
                    thread = []

                    for item in comments['items']:
                        if 'textDisplay' in item['snippet'].get('topLevelComment', {}).get('snippet', {}):
                            comm = re.sub(r'http\S+|www.\S+|href\S+', '', item['snippet']['topLevelComment']['snippet']['textDisplay'])
                            date = item['snippet']['topLevelComment']['snippet']['publishedAt']
                            # comm = ' '.join(w for w in nltk.wordpunct_tokenize(comm) if w.lower() in engWords or not w.isalpha())
                            thread.append(regex.sub('', comm) + '|' + date)

                    commentList = ';'.join(thread)

                writer.writerow([vid[1][0], vid[1][1], vid[1][2], commentList.encode('ascii', 'ignore').decode('ascii')])
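
The comment cleanup above runs two passes: a regex that strips URLs, then a translation of the punctuation characters to nothing. In isolation the two steps behave like this standalone sketch (the sample comment string is made up for illustration):

import re
import string

# same patterns as in the example above
url_pattern = r'http\S+|www.\S+|href\S+'
punct = re.compile('[%s]' % re.escape(string.punctuation))

comment = 'Great video!!! More at http://example.com/watch or www.example.org/more'
no_urls = re.sub(url_pattern, '', comment)   # drop links first
print(punct.sub('', no_urls))                # then drop punctuation -> 'Great video More at  or '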
Example #7
def word_count(title_text_pairs, verbose=False):
    def map_f(_title, text):
        for word in text.split(' '):
            yield (word, 1)

    def reduce_f(word, counts):
        yield (word, sum(counts))

    return mapreduce(map_f=map_f, combine_f=reduce_f, reduce_f=reduce_f, verbose=verbose)(title_text_pairs)
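
For reference, the same map -> group -> reduce flow can be written with nothing but the standard library. The sketch below is a minimal stand-in for the mapreduce helper used above, whose shuffle and combine behavior is assumed rather than shown; it is not that helper's implementation.

from collections import defaultdict

def word_count_plain(title_text_pairs):
    # map phase: emit (word, 1) for every word in every document
    emitted = [(word, 1)
               for _title, text in title_text_pairs
               for word in text.split(' ')]

    # shuffle phase: group the emitted 1s by word
    groups = defaultdict(list)
    for word, one in emitted:
        groups[word].append(one)

    # reduce phase: sum the counts for each word
    return [(word, sum(ones)) for word, ones in groups.items()]

print(word_count_plain([('doc1', 'to be or not to be'), ('doc2', 'be quick')]))
# [('to', 2), ('be', 3), ('or', 1), ('not', 1), ('quick', 1)]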
Example #8
def SpMultiply(A, B):
    #     assert A.n == B.n
    sijv = [('a', ijv[0][0], ijv[0][1], ijv[1]) for ijv in A.ijv]
    sijv += [('b', ijv[0][0], ijv[0][1], ijv[1]) for ijv in B.ijv]
    map_reducer = mapreduce()
    print('MapReduce input:')
    pprint(sijv)

    matrix_multi = map_reducer.parallelize(sijv, 128) \
        .flatMap(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer)
    print('MapReduce Output:')
    final_matrix = []
    for item in matrix_multi.collect():
        print(item)
        final_matrix.append(item)
    return SpMatrix(A.m, B.n, final_matrix)
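
The mapper1, mapper2 and reducer used by SpMultiply are defined elsewhere in that module and not shown here. For orientation, the classic two-round map/reduce formulation of sparse matrix multiplication looks roughly like the plain-Python sketch below; the tagged ('a'/'b', row, col, value) records are the only detail carried over from the example, everything else is an illustrative assumption.

from collections import defaultdict

def spmultiply_plain(tagged_entries):
    """Multiply sparse A and B given as ('a'|'b', row, col, value) records."""
    # Round 1: key every record by the shared inner index so that A[i, k]
    # and B[k, j] entries land in the same group.
    by_inner = defaultdict(lambda: {'a': [], 'b': []})
    for tag, row, col, value in tagged_entries:
        if tag == 'a':
            by_inner[col]['a'].append((row, value))   # inner index is A's column
        else:
            by_inner[row]['b'].append((col, value))   # inner index is B's row

    # Round 2: emit partial products keyed by output cell (i, j) and sum them.
    cells = defaultdict(float)
    for group in by_inner.values():
        for i, a_val in group['a']:
            for j, b_val in group['b']:
                cells[(i, j)] += a_val * b_val
    return dict(cells)

# A = [[1, 2], [0, 3]], B = [[4, 0], [5, 6]]  ->  A @ B = [[14, 12], [15, 18]]
entries = [('a', 0, 0, 1), ('a', 0, 1, 2), ('a', 1, 1, 3),
           ('b', 0, 0, 4), ('b', 1, 0, 5), ('b', 1, 1, 6)]
print(spmultiply_plain(entries))   # {(0, 0): 14.0, (0, 1): 12.0, (1, 0): 15.0, (1, 1): 18.0}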
Example #9
def main():

    args = parse_args()
    raw_zones, aps = parse_grouped_aps(args.g)
    zones = {}
    for zone in raw_zones:
        for ap in zone['aps']:
            zones[ap] = str(zone['zone'])

    data = parse_data_files(aps, args.d, zones)
    print "Num Records: ", len(data)

    sc = mapreduce()
    #sc = SparkContext(appName="CountsApp")

    result = (sc.parallelize(data, 128)
              .map(mapper0).reduceByKey(reducer)
              .flatMap(mapper1).reduceByKey(reducer)
              .map(mapper2).reduceByKey(reducer)
              .collect())

    zone_counts = {}
    for zone, counts in result:
        zone_counts[zone] = counts

    for i in range(NUM_DAYS):
        day_start = START_TIME + DAY * i
        date_elem = date.fromtimestamp(day_start)
        subset = defaultdict(dict)
        subset["interval"] = COUNT_INTERVAL
        subset["start_time"] = day_start
        for zone in zone_counts:
            counts = zone_counts[zone]
            for j in range(int(DAY / COUNT_INTERVAL)):
                interval = day_start + j * COUNT_INTERVAL
                if interval in counts:
                    subset[zone][interval] = counts[interval]
        day = str(date_elem.day)
        month = str(date_elem.month)
        year = str(date_elem.year)
        with open(args.o + month + '-' + day + '-' + year[2:] + '.json', 'w') as outfile:
            json.dump(subset, outfile, indent=4)
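
The per-day loop above slices each day into COUNT_INTERVAL-second buckets starting at START_TIME and keeps only the buckets for which a zone reported counts. The bucket arithmetic on its own reduces to the short sketch below; the constant values here are illustrative stand-ins, not the module's actual settings.

from datetime import date

DAY = 86400               # seconds per day (stand-in value)
COUNT_INTERVAL = 3600     # one bucket per hour (stand-in value)
START_TIME = 1546300800   # 2019-01-01 00:00:00 UTC (stand-in value)

day_start = START_TIME + DAY * 2                  # start of the third day in the range
bucket_starts = [day_start + j * COUNT_INTERVAL   # timestamp at which each bucket begins
                 for j in range(DAY // COUNT_INTERVAL)]
print(date.fromtimestamp(day_start), len(bucket_starts))   # local date of that day, 24 hourly buckets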
Example #10
 def run_mapreduce(self):
     '''
     Parameters:
         Input: None
         Output: Clusters
     Purpose:
         Fetches all of the data from the database and formats it so that it
         can be run through MapReduce. It then runs the map and reduce steps
         for the configured number of iterations and returns the clusters.
     '''
     conn = sqlite3.connect('data/playlist_data.db')
     conn.text_factory = lambda x: str(x, 'latin1')
     c = conn.cursor()
     c.execute("""select * from songs;""")
     d = c.fetchall()
     data = []
     for row in d:
         # the first eight columns become the key, the remaining columns the values
         key = list(row[:8])
         values = list(row[8:])
         data.append([key, values])
     for i in range(len(data)):
         data[i] = [0, data[i]]
     data2 = copy.deepcopy(data)
     for i in range(self._numClusters):
         self._centroidRandomNodes.append([i, data2[i][1][1]])
     sc = mapreduce()
     output = []
     new_data = data
     result = []
     for i in range(self._numIterations):
         result = (sc.parallelize(data, 128)
                   .map(self.mapper1)
                   .reduceByKey(self.reducer)
                   .map(self.mapper2)
                   .collect())
     sc.stop()
     return result
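
The mapper1, reducer and mapper2 methods referenced in the pipeline are defined elsewhere in the class and not shown here. As a rough orientation, one common way to phrase a single k-means iteration in map/reduce terms is sketched below; the 2-D points, Euclidean distance and toy data are illustrative assumptions, not the song-feature records this class actually clusters.

import math
from collections import defaultdict

def assign_step(points, centroids):
    # "map" + "reduceByKey": key each point by the index of its nearest centroid
    assigned = defaultdict(list)
    for point in points:
        nearest = min(range(len(centroids)),
                      key=lambda c: math.dist(point, centroids[c]))
        assigned[nearest].append(point)
    return assigned

def update_step(assigned):
    # second "map": average each group to obtain the new centroid
    new_centroids = {}
    for idx, pts in assigned.items():
        dims = range(len(pts[0]))
        new_centroids[idx] = tuple(sum(p[d] for p in pts) / len(pts) for d in dims)
    return new_centroids

points = [(0.0, 0.0), (0.0, 1.0), (9.0, 9.0), (10.0, 10.0)]
centroids = [(1.0, 1.0), (8.0, 8.0)]
for _ in range(3):                        # a few iterations suffice for this toy data
    updated = update_step(assign_step(points, centroids))
    centroids = [updated[i] for i in sorted(updated)]
print(centroids)                          # [(0.0, 0.5), (9.5, 9.5)]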
Example #11
def main():
    # Get the ratings from ratings.csv
    ratings = read_netflix_ratings()

    # Initialize MapReduce
    map_reducer = mapreduce()
    pipeline = map_reducer.parallelize(ratings, 128)

    # DO NOT MODIFY THIS!
    similar_table = pipeline.map(mapper1) \
        .reduceByKey(reducer) \
        .flatMap(mapper2) \
        .reduceByKey(reducer) \
        .flatMap(mapper3) \
        .reduceByKey(reducer) \
        .flatMap(mapper4) \
        .reduceByKey(reducer) \
        .flatMap(mapper5)

    for item in similar_table.collect():
        print(item)
Example #12
def main():
    args = parse_args()

    with open(args.a, 'r') as infile:
        aps = [line.rstrip() for line in infile]

    data = parse_data_files(aps, args.d)
    print "Num records: ", len(data)
    sc = mapreduce()
    # sc = SparkContext(appName="NetflixProblemApp")

    similarities_result = (sc.parallelize(data, 128)
                           .map(mapper0).reduceByKey(reducer)
                           .flatMap(mapper1).reduceByKey(reducer)
                           .map(mapper2)
                           # .sortBy(lambda x: (x[0][0], x[0][1]))
                           .sortBy(lambda x: x[1])
                           .collect())

    sc.stop()

    out_path = args.o or dirname(realpath(__file__)) + '/connections.json'
    with open(out_path, 'w') as outfile:
        json.dump(similarities_result, outfile, indent=4)
Example #13
def main():
    # 'archivos' (Spanish for 'files') lists the input text files handed to mapreduce
    archivos = ['archivo1.txt', 'archivo2.txt', 'archivo3.txt']

    res = mapreduce.mapreduce(archivos, f_map, f_reduce)

    mapreduce.printb(res)
Example #14
 def configure(self, env):
   import params
   env.set_params(params)
   mapreduce(name="historyserver")
Example #15
 def configure(self, env):
     import params
     env.set_params(params)
     mapreduce(name="historyserver")
Example #16
def mapper_word_count(input, doc_id):
    # doc_id is accepted for interface parity with mapper_inverted_index but is not used here
    import string
    input = input.translate(str.maketrans('', '', string.punctuation))
    input = input.split(" ")
    word_op = []

    for word in input:
        if word != "":
            word_op.append((word, 1))

    return word_op


def reducer_word_count(input):
    return sum(input)


def mapper_inverted_index(input, doc_id):
    import string
    input = input.translate(str.maketrans('', '', string.punctuation))
    input = input.split(" ")
    word_op = []

    for word in input:
        if word != "":
            word_op.append((word + "_" + doc_id, 1))

    return word_op


def reducer_inverted_index(input):
    return sum(input)

from mapreduce import mapreduce

input_location = r"./Input"
run_object = mapreduce(num_mappers=5, num_reducers=5)
run_object.run(input_location, mapper_word_count, reducer_word_count)
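
The inverted-index mapper above keys each count by word_docid rather than by word alone, so the reducer's sums stay separated per document. The standalone sketch below (plain Python, with a made-up two-document corpus) shows the grouping that key scheme produces.

from collections import defaultdict

docs = {'d1': 'map reduce map', 'd2': 'reduce all the things'}   # hypothetical corpus

counts = defaultdict(int)
for doc_id, text in docs.items():
    for word in text.split(' '):
        if word != '':
            counts[word + '_' + doc_id] += 1   # same word_docid key scheme as mapper_inverted_index

print(dict(counts))
# {'map_d1': 2, 'reduce_d1': 1, 'reduce_d2': 1, 'all_d2': 1, 'the_d2': 1, 'things_d2': 1}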
Example #17
 def configure(self, env):
     import params
     env.set_params(params)
     mapreduce()
Example #18
    def configure(self, env):
        import params

        env.set_params(params)
        mapreduce()
Example #19
from mapreduce.config import MapReduce, Mapper, Reducer
from mapreduce import mapreduce
import sys


class WC_Mapper(Mapper):
    def map(self, key, value):
        res = []
        for word in value.split(' '):
            res.append((word, 1))
        return res


class WC_Reducer(Reducer):
    def reduce(self, key, value):
        res = 0
        for elem in value:
            res += int(elem)
        return res


if __name__ == "__main__":
    out = sys.argv[1]
    wc_m = WC_Mapper()
    wc_r = WC_Reducer()

    config = MapReduce('./input', wc_m, wc_r, out)

    mapreduce(config)
Example #20
 def configure(self, env):
   import params
   env.set_params(params)
   mapreduce(name="jobtracker")