Example #1
def test_multi_thread_scheduler():
    """
    tasks chain, transfer count between tasks, then update count value
    """
    result_list = []

    class Task1(SourceTask):
        def _execute(self, data_bundle):
            data_bundle["count"] += 1
            result_list.append(data_bundle["count"])
            self._emit(data_bundle)

    class Task2(Task):
        def _execute(self, data_bundle, from_node):
            data_bundle["count"] += 1
            result_list.append(data_bundle["count"])
            self._emit(data_bundle)

    prev_task = Task1(init_data_bundle=DataBundle(data_dict={"count": 0}))
    graph = ComputationalGraph()
    for _ in range(99):
        current_task = Task2(stop_timeout_window=0.01)
        graph.add_edge(prev_task, current_task)
        prev_task = current_task

    scheduler = MultiThreadScheduler(graph)
    scheduler.schedule()

    assert result_list == list(range(1, 101))
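The snippets on this page assume a small task-graph library whose internals are not shown. As a rough sketch of the contract the test above relies on (the wiring details here are assumptions, not the library's actual code):

    class DataBundle:
        """Stand-in: a dict-like container handed from task to task."""
        def __init__(self, data_dict):
            self.data_dict = data_dict

        def __getitem__(self, key):
            return self.data_dict[key]

        def __setitem__(self, key, value):
            self.data_dict[key] = value


    class Task:
        """Stand-in: a graph node; _emit forwards a bundle downstream."""
        def __init__(self, **kwargs):
            self._successors = []  # populated by ComputationalGraph.add_edge

        def _emit(self, data_bundle):
            for successor in self._successors:
                successor._execute(data_bundle, from_node=self)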
Example #2
    def recommend_based_on_previous_input(self, previous_code):
        graph = ComputationalGraph()

        query_task = InputTask(init_data_bundle=DataBundle(
            data_dict={"code": previous_code}))
        index_ranker = IndexRanker()
        graph.add_edge(query_task, index_ranker)

        scheduler = LinearScheduler(graph)
        scheduler.schedule()
Example #3
    def recommend_code_piece(self, code):
        graph = ComputationalGraph()
        query_task = InputTask(init_data_bundle=DataBundle(
            data_dict={"code": code}))
        rank_task = CoarseRanker()
        print_recommend_code_to_console = PrintRecommendCodeToConsoleTask()

        graph.add_edge(query_task, rank_task)
        graph.add_edge(rank_task, print_recommend_code_to_console)

        scheduler = LinearScheduler(graph)
        scheduler.schedule()
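Examples 2 and 3 repeat the same pattern: build a small linear graph, then run it with a LinearScheduler. A hypothetical convenience wrapper for that pattern (run_pipeline is not part of the library, just an illustration):

    def run_pipeline(*tasks):
        # chain the given tasks in order, then execute sequentially
        graph = ComputationalGraph()
        for upstream, downstream in zip(tasks, tasks[1:]):
            graph.add_edge(upstream, downstream)
        LinearScheduler(graph).schedule()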
Example #4
    def update_from_given_code(self, code):

        graph = ComputationalGraph()

        input_task = InputTask(init_data_bundle=DataBundle(
            data_dict={"code": code}))
        parsing_task = ParsingTask()
        corpus_dump_task = CorpusDumpTask()

        graph.add_edge(input_task, parsing_task)
        graph.add_edge(parsing_task, corpus_dump_task)

        scheduler = LinearScheduler(graph)
        scheduler.schedule()
Example #5
    def _execute(self, data_bundle):
        # preprocess the input code string first
        code = "\n".join(
            string_util.left_padding_strings(data_bundle["code"].split("\n")))
        # filepath is not mandatory
        filepath = data_bundle.data_dict.get("filepath", "N/A")
        try:
            self._emit(
                DataBundle(data_dict={
                    "code": code,
                    "filepath": filepath
                }))
            logger.debug("parse input {} succeeded".format(code))
        except Exception:
            logger.error("parse input {} failed".format(code))
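string_util.left_padding_strings is used here and below but never shown; from its call sites it appears to strip the common leading whitespace from a list of lines. A plausible stand-in (an assumption, not the project's actual helper):

    def left_padding_strings(lines):
        # drop the smallest shared left padding, like textwrap.dedent,
        # so the chunk starts at column 0
        paddings = [len(l) - len(l.lstrip(" ")) for l in lines if l.strip()]
        common = min(paddings) if paddings else 0
        return [l[common:] if l.strip() else l for l in lines]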
Example #6
    def update_index_from_default_dir(self):

        work_dir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                                "index_dir")
        graph = ComputationalGraph()

        app_set_task = AppSetPreprocessingTask(init_data_bundle=DataBundle(
            data_dict={"work_dir": work_dir}))
        extract_mapping_task = ExtractMappingTask()
        index_dump_task = IndexDumpTask()

        graph.add_edge(app_set_task, extract_mapping_task)
        graph.add_edge(extract_mapping_task, index_dump_task)

        scheduler = LinearScheduler(graph)
        scheduler.schedule()
Example #7
    def update_from_default_dir(self):

        app_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.path.pardir,
                         "corpus_dir"))
        graph = ComputationalGraph()

        app_set_task = AppSetPreprocessingTask(init_data_bundle=DataBundle(
            data_dict={"work_dir": app_dir}))
        parsing_task = ParsingTask()
        corpus_dump_task = CorpusDumpTask()

        graph.add_edge(app_set_task, parsing_task)
        graph.add_edge(parsing_task, corpus_dump_task)

        scheduler = LinearScheduler(graph)

        scheduler.schedule()
Example #8
    def _execute(self, data_bundle, from_node):
        code = data_bundle["code"]
        lines = code.split("\n")
        if len(lines) > 1:
            try:
                root = Parser.spt_parse("\n".join(lines[:-1]))
                features = FeatureExtraction.extract(root)
                self._emit(
                    DataBundle(
                        data_dict={
                            "features": features,
                            "line_number": len(lines) - 1,
                            "next_line": lines[-1]
                        }))
                logger.info("extract mapping succeeded, next line is: " +
                            lines[-1])
            except Exception:
                logger.warning("extract mapping failed")
Example #9
    def _execute(self, data_bundle, from_node):
        """
        parse source code, output the source code and its feature set
        """
        source_code = data_bundle["code"]
        filepath = data_bundle["filepath"]
        try:
            root = Parser.spt_parse(source_code)
            features = FeatureExtraction.extract(root)
            emit_data_bundle = DataBundle(data_dict={
                "code": source_code,
                "features": features,
                "filepath": filepath
            })
            self._emit(emit_data_bundle)
            logger.info(
                "parsing code succeeded, filepath: {}".format(filepath))
        except Exception:
            # if parsing fails, ignore this code piece
            logger.warning(
                "parsing code pieces failed, filepath: {}".format(filepath))
Example #10
    def _execute(self, data_bundle, from_node):
        """
        choose the top-ranked code pieces based on set intersection
        """
        query_code = data_bundle["code"]
        code_lines = len(query_code.split("\n"))
        query_features = FeatureExtraction.extract(
            Parser.spt_parse(query_code))
        result_list = []
        for corpus_code, corpus_features, filepath in self._corpus_reader:
            # similarity = size of the feature-set intersection
            similarity = len(corpus_features & query_features)
            result_list.append(RankElement(similarity, corpus_code, filepath))
        result_list.sort(
            key=lambda e:
            (e.similarity, -abs(e.corpus_code_line_number - code_lines)),
            reverse=True)
        emit_data_bundle = DataBundle(
            data_dict={
                "code": query_code,
                "rank_list": result_list[:self._rank_threshold]
            })
        self._emit(emit_data_bundle)
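The sort key above orders candidates by feature overlap first and, on ties, prefers pieces whose line count is closest to the query's. A tiny self-contained illustration of that ordering (RankElement is approximated with a namedtuple here):

    from collections import namedtuple

    RankElement = namedtuple(
        "RankElement", ["similarity", "corpus_code_line_number"])

    code_lines = 3
    results = [RankElement(5, 10), RankElement(5, 4), RankElement(7, 30)]
    results.sort(
        key=lambda e: (e.similarity,
                       -abs(e.corpus_code_line_number - code_lines)),
        reverse=True)
    # highest similarity first; the tie at 5 is broken by closeness to 3 lines
    print(results)  # [(7, 30), (5, 4), (5, 10)]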
Example #11
    def _handle_python_code(self, filepath):
        """
        split the Python code into chunks using several window sizes
        """
        # Algorithm 1: sliding windows
        chunk_sizes = [1, 2, 5, 10, 20]
        code_piece_set = set()
        for chunk_size in chunk_sizes:
            lines = []
            with open(filepath, "r") as f:
                for line in f:
                    # preprocess the line: remove \n, replace \t with four spaces
                    line = line.strip("\n").replace("\t", "    ")
                    # empty lines and comments are ignored
                    if not self._filter(line):
                        lines.append(line)
                        # maintain a sliding window
                        if len(lines) > chunk_size:
                            lines.pop(0)
                        code_piece = "\n".join(
                            string_util.left_padding_strings(lines))
                        # dedup code pieces
                        if code_piece not in code_piece_set:
                            code_piece_set.add(code_piece)
                            data_bundle = DataBundle(data_dict={
                                "filepath": filepath,
                                "code": code_piece
                            })
                            self._emit(data_bundle)
        # Algorithm 2: check classes and functions
        with open(filepath, "r") as f:
            text = f.read()
        lines = text.split("\n")
        try:
            ast_root = ast.parse(text)
            class_linenos, function_linenos = [], []
            for node in ast.walk(ast_root):
                if isinstance(node, ast.ClassDef):
                    class_linenos.append(node.lineno - 1)
                elif isinstance(node, ast.FunctionDef):
                    function_linenos.append(node.lineno - 1)
            # collect classes and functions
            for lineno in (class_linenos + function_linenos):
                code_block = []
                left_padding = string_util.get_left_padding_spaces(
                    lines[lineno])
                code_block.append(lines[lineno])
                lineno += 1
                while lineno < len(lines) and \
                        (string_util.is_empty_string(lines[lineno]) or
                         (string_util.get_left_padding_spaces(lines[lineno]) > left_padding)):
                    if not string_util.is_empty_string(lines[lineno]):
                        code_block.append(lines[lineno])
                    lineno += 1
                code_piece = "\n".join(
                    string_util.left_padding_strings(code_block))
                # dedup code pieces
                if code_piece not in code_piece_set:
                    code_piece_set.add(code_piece)
                    data_bundle = DataBundle(data_dict={
                        "filepath": filepath,
                        "code": code_piece
                    })
                    self._emit(data_bundle)
        except Exception:
            logger.warning("handle python file: {} failed".format(filepath))
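Algorithm 1 above emits, for every window size, each run of consecutive surviving lines. The window mechanics in isolation (filtering and dedup omitted):

    lines, chunk_size = [], 2
    for line in ["a", "b", "c"]:
        lines.append(line)
        if len(lines) > chunk_size:
            lines.pop(0)
        print("\n".join(lines))
    # prints "a", then "a\nb", then "b\nc" -- windows of at most 2 lines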
Example #12
class SourceTaskStub(SourceTask):
    def _execute(self, data_bundle):
        for i in range(100):
            self._emit(DataBundle(data_dict={"value": i}),
                       RandomDispatchStrategy)


class IntermediateTask(Task):
    def __init__(self, name):
        super().__init__(name, stop_timeout_window=0.2)

    def _execute(self, data_bundle, from_node):
        self._emit(data_bundle)


class TerminationTask(Task):
    def _execute(self, data_bundle, from_node):
        print(data_bundle["value"])


graph = ComputationalGraph()
source = SourceTaskStub(init_data_bundle=DataBundle(data_dict={}))

tasks = []
for i in range(10):
    tasks.append(IntermediateTask("name " + str(i)))
termination_task = TerminationTask("termination")

for i in range(10):
    graph.add_edge(source, tasks[i])
for i in range(10):
    graph.add_edge(tasks[i], termination_task)

scheduler = MultiThreadScheduler(graph)
scheduler.schedule()
Example #13
    def execute(self):
        self._execute(self._init_data_bundle)
        # a source task won't receive any more input data,
        # so we can stop the entire pipeline
        self._emit(DataBundle.stop_signal())
        self._stop()
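The stop signal emitted in execute() suggests shutdown travels through the graph like ordinary data. How a receiving task honors it is not shown; one hypothetical sketch (_on_receive and is_stop_signal are assumed names):

    def _on_receive(self, data_bundle, from_node):
        if data_bundle.is_stop_signal():
            # forward the signal so downstream tasks can also shut down
            self._emit(data_bundle)
            self._stop()
        else:
            self._execute(data_bundle, from_node)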