Example #1

    def main(self):
        if not os.path.exists(self.args.file):
            sys.exit('Input file %s not found' % self.args.file)
        if not os.path.exists(self.args.output):
            sys.exit('Output directory %s not found' % self.args.output)

        mrs.main(self.MAPREDUCE_CLASS, args=self._parsed_etl_args.job_args)
Example #2
        yield (ds, self.callback)

        itr = itr + 1
        while True:
            # iteratively map reduce (count counts)
            ds = job.map_data(ds, mapper=self.map_counts)  # key=int, val=int
            ds = job.reduce_data(
                    ds, self.reduce,
                    outdir="%s/counts_of_counts%d" % (self.args[-1], itr),
                    format=mrs.fileformats.TextWriter)
            itr = itr + 1
            yield (ds, self.callback)

    # count the counts
    def map_counts(self, key, count):
        yield (count, 1)

    # count the words
    def map_words(self, line_num, line_text):
        for word in line_text.split():
            word = word.strip(string.punctuation).lower()
            if word:
                yield (word, 1)

    # aggregate the counts
    def reduce(self, key, counts):
        yield sum(counts)


if __name__ == '__main__':
    mrs.main(IterativeWordCount)

# vim: et sw=4 sts=4
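The driver above yields (ds, self.callback) pairs but the callback itself never appears in the fragment. Below is a minimal sketch of one possible stopping rule; the signature and the return-False-to-stop convention are assumptions about Mrs, not taken from the snippet:

    def callback(self, ds):
        # Hypothetical stopping rule: give up after a fixed number of
        # iterations.  The exact contract Mrs expects is assumed here.
        self.rounds = getattr(self, 'rounds', 0) + 1
        return self.rounds < 10  # assumed: True means keep iterating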
Example #4
class invertedIndexing(mrs.MapReduce):
    
    # Mapper takes in a key, which is the indexNumber and a value, which is a line 
    def map(self, indexNumber, line):
        # Reads a line and creates list of the words in the line
        line = line.split() 
        for word in line:
            # Converts all letters to lowercase
            word = word.lower()
            # Converts every instance of punctuation to a space
            word = word.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
            # Map the word only once it has been validated: it contains only
            # letters, is longer than 3 characters, and is not a stopword.
            if word.isalpha() and len(word) > 3 and word not in stopWords:
                yield (word, indexNumber + 1)  # lowercase word, stripped of punctuation, plus the index number

    # The reducer takes a key (a word) and an iterable of index numbers, and yields the list of distinct index numbers
    def reduce(self, word, indexNumber):
        indexNumbers = []

        for currentIndex in indexNumber:
            # Record at most the first 50 lines; if a word appears twice on
            # one line, record that line only once.
            if len(indexNumbers) >= 50:
                break
            if currentIndex not in indexNumbers:
                indexNumbers.append(currentIndex)
        yield indexNumbers

if __name__ == '__main__':
    mrs.main(invertedIndexing)

Example #5

#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import mrs
import string

class WordCount(mrs.MapReduce):
    """Count the number of occurrences of each word in a set of documents.

    Word Count is the classic "hello world" MapReduce program. This is a
    working example and is provided to demonstrate a simple Mrs program. It is
    further explained in the tutorials provided in the docs directory.
    """
    def map(self, line_num, line_text):
        for word in line_text.split():
            word = word.strip(string.punctuation).lower()
            if word:
                yield (word, 1)

    def reduce(self, word, counts):
        yield sum(counts)

if __name__ == '__main__':
    mrs.main(WordCount)

# vim: et sw=4 sts=4
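Mrs programs such as WordCount are launched directly from the command line, with inputs first and the output location last (the positional-argument convention is visible in the other examples on this page). A hypothetical invocation; the file names are made up:

# Hypothetical invocation (file names made up):
#   $ python wordcount.py input.txt output_dir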
Example #6

        self.items.append(node)

    def add_edge(self, edge):
        self.items.append(edge)

    def get_path_string(self, remove_cycles=True, lexicalize=False,
            max_length=5):
        to_output = []
        i = len(self.items) - 1
        while i >= 0:
            item = self.items[i]
            if lexicalize or not isinstance(item, int):
                to_output.append(str(item))  # str() so int node ids can be joined below
            if isinstance(item, int):
                j = i - 1
                while j >= 0:
                    if self.items[j] == self.items[i]:
                        i = j
                    j -= 1
            i -= 1
        if len(to_output) > max_length or not to_output:
            return None
        to_output.reverse()
        return '-'.join(to_output)
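A worked trace of the cycle removal above, on hypothetical items (ints stand for node ids, strings for edge labels):

# Worked trace (hypothetical items):
#   self.items == [1, 'knows', 2, 'likes', 1, 'sees', 3]
# Scanning backwards, node 1 repeats, so i jumps to its first occurrence
# and the 1 -knows-> 2 -likes-> 1 cycle is dropped.  With lexicalize=False
# only the surviving edge label remains: get_path_string() == 'sees'.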


if __name__ == '__main__':
    mrs.main(RandomWalkAnalyzer)

# vim: et sw=4 sts=4
Example #7

    def reduce(self, key, jobs):
        """ run the job """
        for job in jobs:
            # execute the job string
            print "\nRunning cmd %d\n\t%s" % (key,job)
            os.system(job)

        print "============= Done! ==================="
        yield ["Done"]


    '''
    Define a set of command line parameters whose values will be 
    passed to your program in the opts parameter of the __init__ method.
    '''
    @classmethod
    def update_parser(cls, parser):
        # TODO: add your option(s) here
        parser.add_option('--myopt',
                type='int',
                dest='myopt',
                default=1,
                help='Myopt determines blah blah blah...',
                )
        return parser

if __name__ == '__main__':
    mrs.main(JobLauncher)

# vim: et sw=4 sts=4
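The value parsed from --myopt above arrives on the opts object handed to the program, as the comment in the snippet describes. A minimal sketch of reading it; the base-class __init__ signature is assumed from that comment, not shown in the snippet:

import mrs

class JobLauncher(mrs.MapReduce):
    def __init__(self, opts, args):
        super(JobLauncher, self).__init__(opts, args)
        # opts.myopt carries the parsed --myopt value (default 1)
        self.myopt = opts.myopt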
Example #8

            for k, v in zip(tempK, i[1]):
                # print(k)
                matrixC[k[0]][k[1]] = v
        print(len(matrixA))
        # for i in matrixC:
        #     print(i)

        sys.stdout.flush()

        return 0

    @classmethod
    def update_parser(cls, parser):
        parser.add_option('-P',
                          '--num_processes',
                          dest='num_processes',
                          type='int',
                          help='Number of processes to use',
                          default=2)

        parser.add_option('-N',
                          '--row_size',
                          dest='row_size',
                          type='int',
                          help='Row size of the matrix',
                          default=16)
        return parser

if __name__ == '__main__':
    mrs.main(MatrixMultiplication)
Example #9
        print(self.args[0])
        direc = self.args[0]
        fileList = glob.glob(direc + '/*.txt')
        print(fileList)
        print("----------------------------------")
        return job.file_data(fileList)

    def map(self, key, value):
        # print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # print(value)
        words = re.split(r'[\W0-9]', value, flags=re.UNICODE)
        for i in words:
            if i != '':
                i = i.lower()
                # print(i)
                yield i, 1

    def reduce(self, key, values):
        length = len(tuple(values))
        # print length
        yield length


if __name__ == '__main__':
    mrs.main(WordCount)

# j = mrs.job.Job
# wc = WordCount
# wc.input_data(j)
# wc.map("AAA 2.1 2 , bc2 3z bbb",1)
Example #10
    """

    @mrs.output_serializers(key=mrs.MapReduce.str_serializer,
                            value=mrs.MapReduce.int_serializer)
    def map(self, key, value):
        for word in value.split():
            word = word.strip(string.punctuation).lower()
            if word:
                yield (word, 1)

    def reduce(self, key, values):
        yield sum(values)

    combine = reduce

    def input_data(self, job):
        if len(self.args) < 2:
            print("Requires input(s) and an output.", file=sys.stderr)
            return None
        inputs = []
        # Each argument except the last names a manifest file whose lines
        # are the actual input files; the last argument is the output.
        for filename in self.args[:-1]:
            with open(filename) as f:
                for line in f:
                    inputs.append(line.strip())
        return job.file_data(inputs)

if __name__ == '__main__':
    mrs.main(WordCount2)

# vim: et sw=4 sts=4
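WordCount2 reuses its reducer as a combiner (combine = reduce above). That is only safe because summation is associative, so pre-aggregating partial counts on each map task cannot change the final total. A quick self-contained sanity check:

# Local pre-aggregation must not change the final result; summation is
# associative, so combining partial sums first is safe:
assert sum([1, 1, 1, 1]) == sum([sum([1, 1]), sum([1, 1])])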
Example #11
        job.wait(output)
        output.fetchall()
        for key, value in output.data():
            if key:
                inside = value
            else:
                outside = value

        pi = 4 * inside / (inside + outside)
        print(pi)
        sys.stdout.flush()

        return 0

    @classmethod
    def update_parser(cls, parser):
        parser.add_option('-p', '--points',
                        dest='num_points', type='int',
                        help='Number of points for each map task',
                        default=1000)

        parser.add_option('-t', '--tasks',
                        dest='num_tasks', type='int',
                        help='Number of map tasks to use',
                        default=40)

        return parser

if __name__ == '__main__':
    mrs.main(SamplePi)
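The fragment above only shows the driver; the map step that feeds it is missing. A minimal sketch of what such a map could look like, keyed the way the driver expects (True for points inside the quarter circle, False otherwise). The method body and the use of opts.num_points are assumptions, not recovered code:

import random

import mrs

class SamplePi(mrs.MapReduce):  # sketch only; the real class continues above
    def map(self, key, value):
        # Each map task draws num_points random points in the unit square
        # and counts how many land inside the quarter circle of radius 1.
        n = int(self.opts.num_points)
        inside = sum(1 for _ in range(n)
                     if random.random() ** 2 + random.random() ** 2 <= 1)
        yield (True, inside)
        yield (False, n - inside)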
Example #12
            dest='pruner', action='extend', search=['specmethod'],
            help='Pruning method for generating speculative children',
            default='OneCompleteIteration',
            )
    parser.add_option('--total-tokens',
            dest='tokens', type='int',
            help='Number of tokens to use (only for the TokenPruner).  This is'
            ' the difference between the number of desired particles and the'
            ' number of available processors.',
            default=0,
            )
    parser.add_option('--min-tokens',
            dest='min_tokens', type='int',
            help='The minimum number of tokens that each particle can have. '
            'This cannot be greater than the total number of tokens available.',
            default=0,
            )

    # There are some sticky issues involved with doing this speculatively
    # that I haven't worried about.  If we ever feel like we should do this,
    # we need to make some changes to the code.  Until then, disabling it is
    # better than leaving it in and having it not work.
    parser.remove_option('--transitive-best')

    return parser

if __name__ == '__main__':
    mrs.main(SpecExPSO, update_parser=update_parser)

# vim: et sw=4 sts=4
Example #13
        Q, anchors = self.get_qank()
        V, K = Q.shape[0], len(anchors)

        P_w = np.diag(Q.sum(axis=1))
        for word in range(V):
            if np.isnan(P_w[word, word]):
                P_w[word, word] = 1e-16

        C = np.zeros((V, K))
        for part in values:
            C += np.loads(part)

        # Recover the topic matrix: each column of A becomes a probability
        # distribution over the V vocabulary words.
        A = np.dot(P_w, C)
        for k in range(K):
            A[:, k] = A[:, k] / A[:, k].sum()
        yield A.dumps()  # yield the recovered topics rather than the raw counts

    @classmethod
    def update_parser(cls, parser):
        parser.add_option('-t',
                          '--tasks',
                          dest='num_tasks',
                          type=int,
                          help='Number of map tasks to use',
                          default=20)
        return parser


if __name__ == '__main__':
    mrs.main(TopicRecover)
Example #14

    def reduce(self, key, values):
        # All we do is aggregate all of the information we've seen
        topic_info = TopicInfo()
        for value in values:
            topic_info.aggregate(value)
        # Then output it as a pickle, for easy analysis later.
        yield topic_info

    @classmethod
    def update_parser(cls, parser):
        parser.add_option('-d', '--dataset',
                dest='dataset',
                help='Database name of the dataset to use',
                )
        parser.add_option('-a', '--analysis',
                dest='analysis',
                help='Database name of the analysis to use',
                )
        parser.add_option('-o', '--outdir',
                dest='outdir',
                help='Directory to store the output',
                )
        return parser


if __name__ == '__main__':
    mrs.main(DependencyParse)

# vim: et sw=4 sts=4
Example #15
                        continue
                    Q[w_i.token, w_j.token] += norm

        yield '', pickle.dumps(scipy.sparse.coo_matrix(Q))

    @mrs.output_serializers(key=mrs.str_serializer, value=mrs.raw_serializer)
    def reduce(self, key, values):
        corpus = self.get_corpus()
        V = len(corpus.vocabulary)
        Q = np.zeros((V, V))

        for Q_part in values:
            Q += pickle.loads(Q_part)
        Q /= Q.sum()

        yield Q.dumps()

    @classmethod
    def update_parser(cls, parser):
        parser.add_option('-t',
                          '--tasks',
                          dest='num_tasks',
                          type=int,
                          help='Number of map tasks to use',
                          default=20)
        return parser


if __name__ == '__main__':
    mrs.main(ConstructQ)
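After the global Q /= Q.sum() in the reducer above, Q behaves like a joint distribution over word pairs. A tiny self-contained check of that normalization step:

import numpy as np

Q = np.array([[2.0, 1.0],
              [1.0, 0.0]])
Q /= Q.sum()
assert abs(Q.sum() - 1.0) < 1e-12  # entries now sum to one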
Example #16
                word = word.split()

                for w in word:
                    w = w.strip(string.punctuation).lower()
                    if w and w not in stopWords and not w[0].isdigit():
                        yield (w, line_num + 1)  # Start counting at 1

            else:
                word = word.strip(string.punctuation).lower()
                if word and word not in stopWords and not word[0].isdigit():
                    yield (word, line_num + 1)  # Start counting at 1

    def reduce(self, word, line_num):
        lineNumbers = []
        for i in line_num:
            # At most 50 lines
            if len(lineNumbers) >= 50:
                break
            # If a word repeats on a line, write the line only once
            if i not in lineNumbers:
                lineNumbers.append(i)

        yield lineNumbers


if __name__ == '__main__':
    mrs.main(WordIndex)
Example #17
			if int(j) < 10:
				j = '0' + j
			if int(i) < 10:
				i = '0' + i
			if line:
				if MatrixMultiply.CurrentFile == '1':
					for k in range(MatrixMultiply.Col):
						if k < 10:
							yield ('0'+str(k)+' '+j, i+' '+val)
						else:
							yield (str(k)+' '+j, i+' '+val)
				if MatrixMultiply.CurrentFile == '2':
					for k in range(MatrixMultiply.Row):
						if k < 10:
							yield (i+' '+'0'+str(k), j+' '+val)
						else:
							yield (i+' '+str(k), j+' '+val)

	def reduce(self, key, values):
		val = []
		# First pass: collect the Col values contributed by matrix 1
		for k in range(MatrixMultiply.Col):
			indexTemp, valTemp = next(values).split(' ')
			val.append(int(valTemp))
		# Second pass: multiply elementwise by the matrix 2 values
		for k in range(MatrixMultiply.Col):
			indexTemp, valTemp = next(values).split(' ')
			val[k] *= int(valTemp)
		yield sum(val)

if __name__ == '__main__':
    mrs.main(MatrixMultiply)
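The zero padding in the mapper above makes string comparison agree with numeric order, which matters anywhere the framework sorts these composite string keys; without it, '10' would sort before '2'. A quick demonstration:

# Unpadded string keys sort in the wrong numeric order...
assert sorted(['10 00', '2 00']) == ['10 00', '2 00']
# ...zero-padded keys keep numeric and lexicographic order in agreement.
assert sorted(['02 00', '10 00']) == ['02 00', '10 00']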
Example #18
            default=0.4,
            )
    parser.add_option('-f', '--func', metavar='FUNCTION',
            dest='func', action='extend', search=['amlpso.functions'],
            help='Function to optimize',
            default='sphere.Sphere',
            )
    parser.add_option('-t', '--top', metavar='TOPOLOGY',
            dest='top', action='extend', search=['amlpso.topology'],
            help='Topology to use',
            default='Isolated',
            )
    parser.add_option('-o', '--out', metavar='OUTPUTTER',
            dest='out', action='extend', search=['amlpso.output'],
            help='Style of output',
            default='Basic',
            )
    parser.add_option('--hey-im-testing',
            dest='hey_im_testing', action='store_true',
            help='Ignore errors from uncommitted changes (for testing only!)',
            default=False,
            )

    return parser


if __name__ == '__main__':
    mrs.main(PI, update_parser)

# vim: et sw=4 sts=4
Example #19

        # Then output it as a pickle, for easy analysis later.
        yield topic_info

    @classmethod
    def update_parser(cls, parser):
        parser.add_option(
            '-d',
            '--dataset',
            dest='dataset',
            help='Database name of the dataset to use',
        )
        parser.add_option(
            '-a',
            '--analysis',
            dest='analysis',
            help='Database name of the analysis to use',
        )
        parser.add_option(
            '-o',
            '--outdir',
            dest='outdir',
            help='Directory to store the output',
        )
        return parser


if __name__ == '__main__':
    mrs.main(DependencyParse)

# vim: et sw=4 sts=4
Example #20
                inside = value
            else:
                outside = value

        pi = 4 * inside / (inside + outside)
        print(pi)
        sys.stdout.flush()

        return 0

    @classmethod
    def update_parser(cls, parser):
        parser.add_option('-p',
                          '--points',
                          dest='num_points',
                          type='int',
                          help='Number of points for each map task',
                          default=1000)

        parser.add_option('-t',
                          '--tasks',
                          dest='num_tasks',
                          type='int',
                          help='Number of map tasks to use',
                          default=40)

        return parser


if __name__ == '__main__':
    mrs.main(SamplePi)
Example #21
        for job in jobs:
            # execute the job string
            print "\nRunning cmd %d\n\t%s" % (key, job)
            os.system(job)

        print "============= Done! ==================="
        yield ["Done"]

    '''
    Define a set of command line parameters whose values will be 
    passed to your program in the opts parameter of the __init__ method.
    '''

    @classmethod
    def update_parser(cls, parser):
        # TODO: add your option(s) here
        parser.add_option(
            '--myopt',
            type='int',
            dest='myopt',
            default=1,
            help='Myopt determines blah blah blah...',
        )
        return parser


if __name__ == '__main__':
    mrs.main(JobLauncher)

# vim: et sw=4 sts=4
Example #22

    def node_pair_map(self, key, value):
        """Emit an entry for each pair of nodes in the walks."""
        for i, start_node in enumerate(value):
            for end_node in value[i + 1:]:
                yield (start_node, end_node)

    def normalize_reduce(self, key, values):
        """Make a conditional probability distribution given the node `key`."""
        counts = defaultdict(int)
        for v in values:
            counts[v] += 1

        distribution = {}
        total = 0
        for node, count in counts.items():
            if count >= MIN_COUNT:
                distribution[node] = count
                total += count

        # Normalize the surviving counts into probabilities (true division).
        for node in distribution:
            distribution[node] /= total

        if distribution:
            yield distribution


if __name__ == '__main__':
    mrs.main(RandomWalkAnalyzer)

# vim: et sw=4 sts=4
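A tiny trace of normalize_reduce above; MIN_COUNT is assumed to be 2 for the example:

# values = ['a', 'a', 'b', 'a', 'c']  ->  counts {'a': 3, 'b': 1, 'c': 1}
# Only 'a' passes the MIN_COUNT filter, so total == 3 and the yielded
# distribution is {'a': 1.0}.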
Example #23
        "they'll", "they're", "they've", "this", "those", "through", "to",
        "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
        "we're", "we've", "were", "what", "what's", "when", "when's", "where",
        "where's", "which", "while", "who", "who's", "whom", "why", "why's",
        "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
        "yours", "yourself", "yourselves"
    ]


WORD_RE = re.compile(r"[\w']+")  # matches words, including possessives like "word's"
STOP_WORDS = getStopWords()


class MRSInvertedIndex(mrs.MapReduce):
    """Count the number of occurrences of each word in a set of documents.
    """
    def map(self, line_num, line_text):
        for word in WORD_RE.findall(line_text):
            word = word.strip(string.punctuation).lower()
            if word not in STOP_WORDS and not word.isdigit():
                yield (word, line_num)

    def reduce(self, word, line_num_list):
        yield word, list(line_num_list)


if __name__ == '__main__':
    mrs.main(MRSInvertedIndex)
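For reference, each reduce above emits a (word, [line numbers]) pair. A hypothetical two-line input where "apple" appears on both lines would produce something like:

# Hypothetical output:
#   ('apple', [0, 1])
# where the line-number keys are whatever Mrs assigned to the input lines.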