Example #1
import pandas as pd

# test_case_data_location and df_feature_check are provided by the test module's setup.
def test_content_reuse_scoring_data():
    cols = [
        'STB_Id', 'STB_Grade', 'STB_Section', 'STB_Text', 'Ref_id',
        'Ref_Grade', 'Ref_Section', 'Ref_Text'
    ]
    case1 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "content_reuse_preparation_feature_check.csv")
    assert df_feature_check(case1, cols)
Example #2
import pandas as pd
import yaml

def test_content_reuse_evaluation_data():
    cols = [
        'state_topic_id', 'reference_topic_id', 'pred_label_percentage',
        'TP_count', 'FP_count', 'TN_count', 'FN_count', 'actual_label'
    ]
    case1 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "content_reuse_evaluation_feature_check.csv")
    assert df_feature_check(case1, cols)

def test_df_feature_check():
    case1 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "Content_Meta_feature_checking_df_1.csv")
    case2 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "Content_Meta_feature_checking_df_2.csv")
    case3 = pd.read_csv(test_case_data_location + "df_feature_check/" +
                        "Content_Meta_feature_checking_df_3.csv")
    mandatory_field_location = (test_case_data_location + "df_feature_check/" +
                                "ContentTagging_mandatory_fields.yaml")
    with open(mandatory_field_location, 'r') as stream:
        # safe_load avoids arbitrary object construction from untrusted YAML
        data = yaml.safe_load(stream)
    mandatory_field_ls = list(data['mandatory_fields'])
    assert df_feature_check(case1, mandatory_field_ls)
    assert not df_feature_check(case2, mandatory_field_ls)
    assert not df_feature_check(case3, mandatory_field_ls)
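Both examples lean on df_feature_check(df, mandatory_fields), which is expected to return True only when every mandatory column is present in the dataframe. A minimal sketch of such a helper, assuming that contract (the project's actual implementation may do more):

    # ContentTagging_mandatory_fields.yaml is assumed to hold a list like:
    #   mandatory_fields:
    #     - identifier
    #     - artifactUrl
    def df_feature_check(df, mandatory_fields):
        """Return True only if every mandatory column is present in df."""
        return set(mandatory_fields).issubset(df.columns)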
    def run(self, range_start, range_end, num_of_processes, content_type):
        """
        The main method to override when creating an operator. It takes in
        the parameters, runs the text enrichment pipeline, writes each
        content id with its enriched text to an h5 file saved as an
        intermediate result, and writes back the path to the timestamp
        folder that holds those results.
        """
        DS_DATA_HOME = self.inputs["DS_DATA_HOME"].read_loc()
        pathTocredentials = self.inputs["pathTocredentials"].read_loc()
        timestr = time.strftime("%Y%m%d-%H%M%S")
        path_to_timestamp_folder = os.path.join(DS_DATA_HOME, timestr)
        content_to_text_path = os.path.join(path_to_timestamp_folder,
                                            "content_to_text")
        # content dump:
        if not os.path.exists(content_to_text_path):
            os.makedirs(content_to_text_path)
            print("content_to_text: ", content_to_text_path)

        contentmeta_path = self.inputs["localpathTocontentMeta"].read_loc()
        # move the content meta into the timestamp (destination) folder;
        # for now the file is moved rather than copied: revisit if a copy is safer.
        moved_contentmeta_path = os.path.join(
            path_to_timestamp_folder, os.path.split(contentmeta_path)[1])
        shutil.move(contentmeta_path, moved_contentmeta_path)

        content_meta = pd.read_csv(moved_contentmeta_path)
        if "derived_contentType" not in list(content_meta.columns):
            content_meta["derived_contentType"] = np.nan
            for row_ind, artifact_url in enumerate(
                    content_meta["artifactUrl"]):
                try:
                    content_meta["derived_contentType"][
                        row_ind] = identify_contentType(artifact_url)
                except BaseException:
                    pass
        content_meta = content_meta[pd.notnull(
            content_meta['derived_contentType'])]
        content_meta.reset_index(inplace=True, drop=True)
        print(self.outputs["timestamp_folder"].location_specify())
        oldwd = os.getcwd()
        contentMeta_mandatory_fields = [
            'artifactUrl', 'derived_contentType', 'downloadUrl', 'gradeLevel',
            'identifier', 'language', 'subject', 'graph_id', 'nodeType',
            'objectType', 'node_id'
        ]
        assert df_feature_check(content_meta, contentMeta_mandatory_fields)

        logging.info("CTT_CONTENT_TO_TEXT_START")
        # drop the stray unnamed index column if the csv carried one:
        if content_meta.columns[0] == "0":
            content_meta = content_meta.drop("0", axis=1)

        # drop duplicate artifactUrl rows from the meta
        if content_meta.duplicated(["artifactUrl"]).any():
            content_meta.drop_duplicates(subset="artifactUrl", inplace=True)
            content_meta.reset_index(drop=True, inplace=True)

        # dropna from artifactUrl feature and reset the index:
        content_meta.dropna(subset=["artifactUrl"], inplace=True)
        content_meta.reset_index(drop=True, inplace=True)

        # time the run
        start = time.time()
        logging.info('Contents detected in the content meta: ' +
                     str(len(content_meta)))
        logging.info(
            "----Running Content_to_Text for contents from {0} to {1}:".format(
                range_start, range_end))
        logging.info("time started: {0}".format(start))
        # subset contentMeta:
        # content_meta = content_meta[content_meta["derived_contentType"].isin(
        #     subset_contentMeta_by.split(", "))]
        content_meta.reset_index(drop=True, inplace=True)
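        # "START" and "END" are string sentinels meaning "first row" / "last row":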
        if range_start == "START":
            range_start = 0
        if range_end == "END":
            range_end = len(content_meta)
        logging.info(
            "CTT_Config: content_meta from {0} to {1} created in: {2}".format(
                range_start, range_end, content_to_text_path))
        print("Number of processes: ", num_of_processes)

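        # resolve Google credentials: prefer the config file; on failure fall
        # back to the GOOGLE_APPLICATION_CREDENTIALS environment variable.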
        status = False
        if os.path.exists(pathTocredentials):
            try:
                config = configparser.ConfigParser(allow_no_value=True)
                config.read(pathTocredentials)
                status = True
                try:
                    path_to_googlecred = config[
                        'google application credentials'][
                            "GOOGLE_APPLICATION_CREDENTIALS"]
                    with open(path_to_googlecred, "r") as cred_json:
                        GOOGLE_APPLICATION_CREDENTIALS = cred_json.read()
                except (KeyError, OSError):
                    logging.info(
                        "Invalid GOOGLE_APPLICATION_CREDENTIALS in config.")
                    logging.info(
                        "***Checking for GOOGLE_APPLICATION_CREDENTIALS environment variable"
                    )
                    status = False
            except Exception:
                logging.info("Invalid config file")
                logging.info(
                    "***Checking for GOOGLE_APPLICATION_CREDENTIALS environment variable"
                )

        if not status:
            try:
                GOOGLE_APPLICATION_CREDENTIALS = os.environ[
                    'GOOGLE_APPLICATION_CREDENTIALS']
                with open(GOOGLE_APPLICATION_CREDENTIALS, "r") as f:
                    GOOGLE_APPLICATION_CREDENTIALS = f.read()
            except (KeyError, OSError):
                GOOGLE_APPLICATION_CREDENTIALS = ""
                logging.info("Not a valid google credential")

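        # run text enrichment for every content row in [range_start, range_end)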
        result = [
            multimodal_text_enrichment(i, timestr, content_meta, content_type,
                                       content_to_text_path,
                                       GOOGLE_APPLICATION_CREDENTIALS)
            for i in range(range_start, range_end)
        ]
        print(result)
        os.chdir(oldwd)
        print("Current directory c2t: ", os.getcwd())
        print("timestamp_folder path:", path_to_timestamp_folder)
        self.outputs["timestamp_folder"].write(path_to_timestamp_folder)