Пример #1
0
def stego_make_mp3stego(wav_files_path,
                        mp3_files_path,
                        bitrate,
                        start_idx=0,
                        end_idx=10000):
    """
    make stego samples by running encode_HCM.exe on every wav file, once per embedding rate
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate, must be a string because it is concatenated into the command line
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    wav_files_list = get_files_list(file_dir=wav_files_path,
                                    start_idx=start_idx,
                                    end_idx=end_idx)
    embedding_rates = ["01", "03", "05", "08", "10"]
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for wav_file_path in wav_files_list:
        # the output name does not depend on the embedding rate, so compute it once per wav file
        file_name = get_file_name(wav_file_path)
        mp3_file_name = file_name.replace(".wav", ".mp3")
        mp3_file_path = fullfile(mp3_files_path, mp3_file_name)
        # NOTE(review): every embedding rate writes to the same mp3_file_path, so only the
        # output of the last rate survives -- confirm whether a per-rate name was intended
        for embedding_rate in embedding_rates:
            # a space is required before "--ER", otherwise the flag is glued to the embedding file path
            # NOTE(review): "--ER" and the rate are still joined without a space, as in the
            # original -- confirm the syntax the encoder expects
            command = "encode_HCM.exe -b " + bitrate + " -E " + embedding_file_path + \
                      " --ER" + embedding_rate + " " + wav_file_path + " " + mp3_file_path
            # the command is a shell command line, not a Python expression: run it, do not eval() it
            os.system(command)
    print(
        "stego samples are made completely, bitrate %s, stego algorithm %s."
        % (bitrate, "HCM"))
Пример #2
0
def calibration(mp3_files_path,
                calibration_files_path,
                bitrate,
                start_idx=0,
                end_idx=10000):
    """
    re-encode every mp3 file with "encode.exe -b <bitrate> <input> <output>"
    files whose output already exists in the calibration folder are skipped
    :param mp3_files_path: the mp3 files path
    :param calibration_files_path: the calibrated mp3 files path (created when it does not exist)
    :param bitrate: bitrate, a string placed on the command line
    :param start_idx: start index
    :param end_idx: end index
    :return: NULL
    """
    if not os.path.exists(mp3_files_path):
        print("The mp3 files path does not exist.")
        return

    source_files = get_files_list(file_dir=mp3_files_path,
                                  start_idx=start_idx,
                                  end_idx=end_idx)
    if not os.path.exists(calibration_files_path):
        os.mkdir(calibration_files_path)

    for source_file in source_files:
        target_file = fullfile(calibration_files_path,
                               get_file_name(source_file))
        if os.path.exists(target_file):
            # already calibrated
            continue
        os.system(" ".join(["encode.exe", "-b", bitrate, source_file, target_file]))

    print("calibration with bitrate %s are completed." % bitrate)
def stego_make_acs(wav_files_path, mp3_files_path, bitrate, width, height="7",
                   embed=embedding_file_path, embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (ACS) by calling the ACS build of lame.exe for every wav file
    an mp3 file that already exists in the output folder is not encoded again
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        # region, layerii, threshold and key are fixed on the command line
        arguments = [
            "C:/Users/Charles_CatKing/Desktop/ACS/lame.exe",
            "-b", bitrate,
            "-embed", embed,
            "-width", width,
            "-height", height,
            "-er", embedding_rate,
            "-region 2 -layerii 1 -threshold 2 -key 123456",
            source,
            target,
        ]
        os.system(" ".join(arguments))
Пример #4
0
def cover_make_mp3stego(wav_files_path,
                        mp3_files_path,
                        bitrate,
                        start_idx=0,
                        end_idx=10000):
    """
    make mp3 cover samples by running "encode_MP3Stego.exe -b <bitrate> <wav> <mp3>"
    wav files whose mp3 output already exists are skipped
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate (128, 192, 256, 320), given as a string
    :param start_idx: the start index of audio files to be processed
    :param end_idx: the end index of audio files to be processed
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path,
                             start_idx=start_idx,
                             end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target_name = get_file_name(source).replace(".wav", ".mp3")
        target = fullfile(mp3_files_path, target_name)
        if os.path.exists(target):
            continue
        os.system(" ".join(["encode_MP3Stego.exe", "-b", bitrate, source, target]))

    print("MP3Stego cover samples with bitrate %s are completed." %
          bitrate)
def stego_make_hcm(wav_files_path, mp3_files_path, bitrate, cost="2",
                   embed=embedding_file_path, frame_num="50", embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (HCM) by calling encode_HCM.exe for every wav file
    an mp3 file that already exists in the output folder is not encoded again
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate
    :param cost: type of cost function, default is "2"
    :param embed: path of embedding file
    :param frame_num: frame number of embedding message, default is "50"
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = [
            "encode_HCM.exe",
            "-b", bitrate,
            "-embed", embed,
            "-cost", cost,
            "-er", embedding_rate,
            "-framenumber", frame_num,
            source,
            target,
        ]
        os.system(" ".join(arguments))
def stego_make_eecs(wav_files_path, mp3_files_path, bitrate, width, height="7",
                    embed=embedding_file_path, frame_num="50", embedding_rate="10", start_idx=None, end_idx=None):
    """
    make stego samples (EECS) by calling encode_EECS.exe for every wav file
    an mp3 file that already exists in the output folder is not encoded again
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param frame_num: frame number of embedding message, default is "50"
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path, start_idx=start_idx, end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        arguments = [
            "encode_EECS.exe",
            "-b", bitrate,
            "-embed", embed,
            "-width", width,
            "-height", height,
            "-er", embedding_rate,
            "-framenumber", frame_num,
            source,
            target,
        ]
        os.system(" ".join(arguments))
def get_test_files(dir):
    """
    Return the files listed by utils.get_files_list(dir) whose name ends
    with the module-level ``test_expansion`` suffix, in listing order.
    """
    return [
        candidate
        for candidate in utils.get_files_list(dir)
        if candidate.endswith(test_expansion)
    ]
Пример #8
0
def stego_make_mp3stego(wav_files_path,
                        mp3_files_path,
                        bitrate,
                        embedding_rate="10",
                        start_idx=None,
                        end_idx=None):
    """
    make stego samples via MP3Stego
    for 10s wav audio, secret messages of 1528 bits (191 Bytes) will be embedded, and the length of secret messages is independent of bitrate
    analysis unit: 50 frames (for 10s mp3 audio, there are 384 frames), 24.83 bytes messages will be embedded
    relative embedding rate         secret messages length      is_selected
             10%                           3  Bytes                  1
             20%                           5  Bytes
             30%                           8  Bytes                  1
             40%                           10 Bytes
             50%                           13 Bytes                  1
             60%                           14 Bytes
             70%                           17 Bytes
             80%                           20 Bytes                  1
             90%                           22 Bytes
             100%                          24 Bytes                  1
    in the process of MP3stego, the messages are compressed
    only the selected rates ("1", "3", "5", "8", "10") are accepted, any other value raises ValueError
    :param wav_files_path: path of wav audio files
    :param mp3_files_path:path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate (128, 192, 256, 320)
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: the start index of audio files to be processed
    :param end_idx: the end index of audio files to be processed
    :return: NULL
    """
    # parallel tables: the i-th rate uses the i-th message length (Bytes)
    supported_rates = ["1", "3", "5", "8", "10"]
    lengths_in_bytes = [3, 8, 13, 20, 24]

    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path,
                             file_type="wav",
                             start_idx=start_idx,
                             end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    rate_position = supported_rates.index(embedding_rate)
    # one message file is generated and shared by all audio files
    embedding_file = message_random(embedding_file_path, lengths_in_bytes[rate_position])

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        os.system(" ".join([
            "encode_MP3Stego.exe", "-b", bitrate,
            "-E", embedding_file,
            "-P pass",
            source,
            target,
        ]))

    print(
        "stego samples are made completely, bitrate %s, stego algorithm %s."
        % (bitrate, "MP3Stego"))
Пример #9
0
    def parse_gold_annotation(self, paths):
        """
        Collect the annotated interaction pairs from XML corpus files and
        pickle them to "models/test_data.p".

        The pickled object is a nested dict:
            document id -> sentence id -> e1 -> e2 -> {"ddi": ..., "type": ...}
        "type" is None when the <pair> element has no "type" attribute, and
        only the first pair seen for a given (e1, e2) is kept.

        :param paths: forwarded to utils.get_files_list to obtain the files
        :return: None
        """
        interaction_collection = {}
        for file_path in utils.get_files_list(paths):
            # The handle is only needed while parsing; close it right away
            # instead of once per sentence (and never for empty documents).
            # NOTE(review): `parse` is presumably xml.dom.minidom.parse -- confirm.
            with open(file_path, "r") as document_data:
                xml_data = parse(document_data)

            document_elt = xml_data.getElementsByTagName("document")
            document_id = dict(document_elt[0].attributes.items())["id"]

            for sentence in xml_data.getElementsByTagName("sentence"):
                sentence_attrs = dict(sentence.attributes.items())
                for pair in sentence.getElementsByTagName("pair"):
                    pair_attrs = dict(pair.attributes.items())
                    sentence_pairs = interaction_collection.setdefault(
                        document_id, {}).setdefault(sentence_attrs["id"], {})
                    partners = sentence_pairs.setdefault(pair_attrs["e1"], {})
                    if pair_attrs["e2"] not in partners:
                        partners[pair_attrs["e2"]] = {
                            "ddi": pair_attrs["ddi"],
                            # dict.has_key() no longer exists in Python 3
                            "type": pair_attrs.get("type"),
                        }

        # Serialize once, after every file has been merged, through a handle
        # that is guaranteed to be flushed and closed.
        with open("models/test_data.p", "wb") as model_file:
            pickle.dump(interaction_collection, model_file)
def get_executables(dir):
    """
    Return the first file listed by utils.get_files_list(dir) whose name
    ends with the module-level ``rdo_ex_substr``, or None when none does.
    """
    match = None
    for candidate in utils.get_files_list(dir):
        if not candidate.endswith(rdo_ex_substr):
            continue
        match = candidate
        if match:
            # stop at the first (non-empty) hit
            break
    return match
Пример #11
0
def get_executables(dir):
    """
    Scan the files of ``dir`` (as listed by utils.get_files_list) and return
    the first one ending with ``rdo_ex_substr``; None if there is no such file.
    """
    found = None
    listing = utils.get_files_list(dir)
    for entry in listing:
        if entry.endswith(rdo_ex_substr):
            found = entry
            if found:
                return found
    return found
Пример #12
0
def stego_make_acs(wav_files_path,
                   mp3_files_path,
                   bitrate,
                   width,
                   height="7",
                   embed=embedding_file_path,
                   embedding_rate="10",
                   frame_embedding_rate="10",
                   region="2",
                   threshold="2",
                   start_idx=None,
                   end_idx=None):
    """
    make stego samples (ACS)
    for every wav file without an existing mp3 output, a message file is obtained from
    message_random(embed), a random 7-digit key is drawn, and encode_ACS.exe is run
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param embedding_rate: embedding rate, default is "10"
    :param frame_embedding_rate: embedding rate in a frame, default is "10"
    :param region: embedding region, default is "2", "0": Big-Value region, "1": Count1 region, "2": All regions
    :param threshold: threshold value for embedding, embedded coefficients are in [-threshold, threshold], default is "2"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    wav_files_list = get_files_list(file_dir=wav_files_path,
                                    file_type="wav",
                                    start_idx=start_idx,
                                    end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for wav_file_path in wav_files_list:
        file_name = get_file_name(wav_file_path)
        mp3_file_name = file_name.replace(".wav", ".mp3")
        mp3_file_path = fullfile(mp3_files_path, mp3_file_name)
        if os.path.exists(mp3_file_path):
            continue

        temp_secret_file_path = message_random(embed)
        # randint() returns an int: convert it before building the command
        # line, and keep it separated from the wav path by a space
        key = str(random.randint(1000000, 9999999))
        command = "encode_ACS.exe -b " + bitrate + " -embed " + temp_secret_file_path + " -width " + width + " -height " + height + \
                  " -er " + embedding_rate + " -fer " + frame_embedding_rate + " -region " + region + " -threshold " + threshold + " -key " + key + \
                  " " + wav_file_path + " " + mp3_file_path
        os.system(command)
Пример #13
0
def stego_make_mp3stego(wav_files_path,
                        mp3_files_path,
                        bitrate,
                        embedding_rate="10",
                        start_idx=0,
                        end_idx=10000):
    """
    make stego samples via MP3Stego
    for 10s wav audio, secret messages of 1528 bits (191 Bytes) will be embedded, and the length of secret messages is independent of bitrate
    analysis unit: 50 frames (for 10s mp3 audio, there are 384 frames), 24.83 bytes messages will be embedded
    relative embedding rate         secret messages length
             10%                           3  Bytes
             30%                           8  Bytes
             50%                           13 Bytes
             80%                           20 Bytes
             100%                          24 Bytes
    in the process of MP3stego, the messages are compressed
    the message file is "stego_<rate>.txt" in embedding_files_mp3stego_path, a one-character rate is prefixed with "0"
    :param wav_files_path: path of wav audio files
    :param mp3_files_path:path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate (128, 192, 256, 320)
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: the start index of audio files to be processed
    :param end_idx: the end index of audio files to be processed
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    sources = get_files_list(file_dir=wav_files_path,
                             start_idx=start_idx,
                             end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    if len(embedding_rate) == 1:
        name_prefix = "stego_0"
    else:
        name_prefix = "stego_"
    embedding_file = fullfile(embedding_files_mp3stego_path,
                              name_prefix + embedding_rate + ".txt")

    for source in sources:
        target = fullfile(mp3_files_path, get_file_name(source).replace(".wav", ".mp3"))
        if os.path.exists(target):
            continue
        os.system(" ".join([
            "encode_MP3Stego.exe", "-b", bitrate,
            "-E", embedding_file,
            "-P pass",
            source,
            target,
        ]))

    print(
        "stego samples are made completely, bitrate %s, stego algorithm %s."
        % (bitrate, "MP3Stego"))
Пример #14
0
def delete_model_data(model_path):
    """
    Delete every regular file that sits next to ``model_path`` unless its
    name ends with ``test_expansion`` or ``project_expansion``.

    Each deleted path is reported through utils.enc_print; 'nothing deleted'
    is reported when no file was removed.
    """
    folder = os.path.dirname(model_path)
    # The listing result is not used below; the call is kept as in the original.
    utils.get_files_list(folder)
    utils.enc_print('\nFound and deleted model data:')

    deleted = 0
    for entry in os.listdir(folder):
        file_path = os.path.join(folder, entry)
        if not os.path.isfile(file_path):
            continue
        if file_path.endswith(test_expansion) or file_path.endswith(project_expansion):
            continue
        utils.enc_print(file_path)
        os.remove(file_path)
        deleted += 1

    if not deleted:
        utils.enc_print('nothing deleted')

    utils.enc_print('\n')
Пример #15
0
def stego_make_ahcm(wav_files_path,
                    mp3_files_path,
                    bitrate,
                    width,
                    height="7",
                    embed=embedding_file_path,
                    embedding_rate="10",
                    start_idx=None,
                    end_idx=None):
    """
    make stego samples (AHCM)
    for every wav file without an existing mp3 output, a message file is obtained from
    message_random(embed), a random 7-digit key is drawn, and encode_AHCM.exe is run
    :param wav_files_path: path of wav audio files
    :param mp3_files_path: path of mp3 audio files (created when it does not exist)
    :param bitrate: bitrate
    :param width: width of parity-check matrix
    :param height: height of parity-check matrix, default is "7"
    :param embed: path of embedding file
    :param embedding_rate: embedding rate, default is "10"
    :param start_idx: start index of audio files
    :param end_idx: end index of audio files
    :return: NULL
    """
    if not os.path.exists(wav_files_path):
        print("The wav files path does not exist.")
        return

    wav_files_list = get_files_list(file_dir=wav_files_path,
                                    file_type="wav",
                                    start_idx=start_idx,
                                    end_idx=end_idx)
    if not os.path.exists(mp3_files_path):
        os.mkdir(mp3_files_path)

    for wav_file_path in wav_files_list:
        file_name = get_file_name(wav_file_path)
        mp3_file_name = file_name.replace(".wav", ".mp3")
        mp3_file_path = fullfile(mp3_files_path, mp3_file_name)
        if os.path.exists(mp3_file_path):
            continue

        temp_secret_file_path = message_random(embed)
        # randint() returns an int: convert it before building the command
        # line, and keep it separated from the wav path by a space
        key = str(random.randint(1000000, 9999999))
        command = "encode_AHCM.exe -b " + bitrate + " -embed " + temp_secret_file_path + " -width " + width + " -height " + height + \
                  " -er " + embedding_rate + " -key " + key + \
                  " " + wav_file_path + " " + mp3_file_path
        os.system(command)
Пример #16
0
def delete_model_data(model_path):
    """
    Remove the model artefacts stored in the directory of ``model_path``.

    A regular file is kept only when its path ends with ``test_expansion``
    or ``project_expansion``; every other regular file is printed with
    utils.enc_print and deleted.
    """
    model_dir = os.path.dirname(model_path)
    # Return value intentionally ignored, exactly as the original did.
    utils.get_files_list(model_dir)
    utils.enc_print('\nFound and deleted model data:')

    removed_count = 0
    for name in os.listdir(model_dir):
        candidate = os.path.join(model_dir, name)
        is_protected = (candidate.endswith(test_expansion)
                        or candidate.endswith(project_expansion)) if os.path.isfile(candidate) else True
        if is_protected:
            continue
        utils.enc_print(candidate)
        os.remove(candidate)
        removed_count = removed_count + 1

    if removed_count == 0:
        utils.enc_print('nothing deleted')

    utils.enc_print('\n')
def evaluate_heuristics(paths):
    """
    Walk the annotated XML corpus files and rebuild, sentence by sentence,
    the entity table {entity id: {"text": ..., "type": ...}} (lower-cased).

    The cue words are loaded from the "ddi_key_phrase" and "ddi_trigger"
    files in the working directory. Nothing is returned or stored yet: the
    cue-word matching is still only sketched in the comment below.

    :param paths: forwarded to utils.get_files_list to obtain the files
    :return: None
    """
    cue_words = []
    for cue_file in ("ddi_key_phrase", "ddi_trigger"):
        with open(cue_file) as key_data:
            cue_words.extend(line.strip() for line in key_data)

    for file_path in utils.get_files_list(paths):
        # Close the handle as soon as parsing is done; the original closed it
        # once per sentence and leaked it for documents without sentences.
        # NOTE(review): `parse` is presumably xml.dom.minidom.parse -- confirm.
        with open(file_path, 'r') as document_data:
            xml_data = parse(document_data)

        document_elt = xml_data.getElementsByTagName("document")
        document_attrs = dict(document_elt[0].attributes.items())
        document_id = document_attrs["id"]

        for sentence in xml_data.getElementsByTagName("sentence"):
            entity_collection = {}
            sentence_attrs = dict(sentence.attributes.items())
            text = sentence_attrs["text"]
            # for word in cue_words:
            #     if word in text
            for entity in sentence.getElementsByTagName("entity"):
                entity_attrs = dict(entity.attributes.items())
                # local names chosen so the builtins id() / type() stay usable
                entity_id = entity_attrs["id"]
                entity_collection[entity_id] = {
                    "text": entity_attrs["text"].lower(),
                    "type": entity_attrs["type"].lower(),
                }
Пример #18
0
def make_seeding_folder(torrent_name, src_dir, dst_dir):
    """
    Copy the files of ``src_dir`` that match the torrent's pieces into
    ``dst_dir`` so the torrent can be seeded from there.

    :param torrent_name: path of the .torrent file (read as bytes and bdecoded)
    :param src_dir: directory scanned for candidate files
    :param dst_dir: destination root; a folder named after the torrent is created in it
    """
    with open(torrent_name, 'rb') as torrent_file:
        raw_torrent = torrent_file.read()
    torrent = bencode.decode(raw_torrent)

    tor_name = torrent['info']['name']
    print(tor_name)
    os.makedirs(name=os.path.join(dst_dir, tor_name), exist_ok=True)

    # every stage is timed through the profiler
    prf = Profiler()
    candidates = get_files_list(src_dir)
    prf.log('get_files_list() delay:')
    candidates = matching_files_by_size(torrent, candidates)
    prf.log('matching_files_by_size() delay:')

    pieces = make_pieces_from_metadata(torrent, candidates)
    prf.log('make_pieces_from_metadata() delay:')

    for piece in pieces:
        piece.find_match()
    prf.log('find_matches() delay:')

    for candidate in candidates:
        if not candidate.is_matched():
            continue
        target_file = os.path.join(dst_dir, candidate.get_torrent_name())
        source_file = candidate.get_matched_name()

        target_dir = os.path.join(dst_dir, candidate.get_path_from_torrent())
        if not os.path.exists(target_dir):
            os.makedirs(name=target_dir, exist_ok=True)

        shutil.copy(source_file, target_file)

    prf.log('copy files delay:')
Пример #19
0
def make_seeding_folder(torrent_name, src_dir, dst_dir):
    """
    Build a seeding folder: decode the torrent file, look in ``src_dir`` for
    files matching its pieces and copy every matched file below ``dst_dir``.

    :param torrent_name: path of the .torrent file
    :param src_dir: directory whose files are candidates for matching
    :param dst_dir: directory receiving the torrent folder and the copies
    """
    with open(torrent_name, 'rb') as fh:
        torrent_bytes = fh.read()
    torrent = bencode.decode(torrent_bytes)

    tor_name = torrent['info']['name']
    print(tor_name)

    torrent_root = os.path.join(dst_dir, tor_name)
    os.makedirs(name=torrent_root, exist_ok=True)

    prf = Profiler()
    files = get_files_list(src_dir)
    prf.log('get_files_list() delay:')
    files = matching_files_by_size(torrent, files)
    prf.log('matching_files_by_size() delay:')

    pieces = make_pieces_from_metadata(torrent, files)
    prf.log('make_pieces_from_metadata() delay:')

    # check every piece against the candidate files
    for piece in pieces:
        piece.find_match()
    prf.log('find_matches() delay:')

    matched_files = (entry for entry in files if entry.is_matched())
    for entry in matched_files:
        dst_file = os.path.join(dst_dir, entry.get_torrent_name())
        src_file = entry.get_matched_name()

        # make sure the sub-folder of the file exists before copying
        dst_path = os.path.join(dst_dir, entry.get_path_from_torrent())
        if not os.path.exists(dst_path):
            os.makedirs(name=dst_path, exist_ok=True)

        shutil.copy(src_file, dst_file)

    prf.log('copy files delay:')
Пример #20
0
def check_and_fix_unformatted_logs(apps):
    """
    Clean the unformatted HTML logs of each application and write the result
    to ``logs/formatted_logs/<app>/<original file name>``.

    Every log has its scripts and styles removed; "mchec" and "pdr" logs get
    an additional application-specific table fix.

    :param apps: iterable of application names
    :return: None
    """
    for app in apps:
        try:
            # makedirs also creates "logs/formatted_logs" when it is missing,
            # where os.mkdir raised FileNotFoundError
            os.makedirs(os.path.join("logs", "formatted_logs", app))
        except FileExistsError:
            print("folder Exists skipping")

        files = utils.get_files_list(app, formatted=False)
        for file in files:
            with open(file, "r") as f:
                body = f.read()
            body = html.fromstring(body)
            body = remove_scripts_and_style(body)
            if app == "mchec":
                body = fix_mchec_error_tables(body)
            elif app == "pdr":
                body = fix_pdr_server_tables_and_NA(body)
            # accept both "\" and "/" as separator so the base name is also
            # extracted correctly on non-Windows paths
            base_name = file.replace("\\", "/").rsplit("/", 1)[-1]
            new_file_name = os.path.join(os.getcwd(), "logs", "formatted_logs",
                                         app, base_name)
            with open(new_file_name, "w+") as f:
                f.write(html.tostring(body, pretty_print=True).decode())
    return
def text_read_all(text_files_dir, height=200, width=576, separator=","):
    """
    load every txt file of a folder into one in-memory array (not recommended)

    :param text_files_dir: the folder of txt files
    :param height: the height of QMDCT matrix
    :param width: the width of QMDCT matrix
    :param separator: separator of each elements in text file
    :return:
        data: QMDCT matrices, float32 ndarray, shape: [files_num, height, width, 1]
    """
    paths = get_files_list(text_files_dir)

    # one [height, width, 1] slot per file, filled in listing order
    data = np.zeros([len(paths), height, width, 1], dtype=np.float32)
    for index, path in enumerate(paths):
        data[index] = text_read(path, height, width, separator)

    return data
    def get_train_corpus_stats(self, paths):
        """
        Parse the training corpus and pickle its statistics under "models/".

        Written files: train_unigram_stats.p, train_bigram_stats.p,
        train_trigram_stats.p, sentence_stats.p and train_corpus_stats.p
        (positive / negative / total sentence counts).

        :param paths: forwarded to utils.get_files_list to obtain the files
        :return: None
        """
        for file in utils.get_files_list(paths):
            self.parse_ddi_corpus(file)

        unigram_stats = self.get_hash_stats(self.interaction_true_word_count,
                                            self.interaction_false_word_count)
        bigram_stats = self.get_hash_stats(self.interaction_true_bigram_count,
                                           self.interaction_false_bigram_count)
        trigram_stats = self.get_hash_stats(self.interaction_true_trigram_count,
                                            self.interaction_false_trigram_count)

        corpus_stats = {
            "positive_sentences": self.positive_sentence_count,
            "negative_sentences": self.negative_sentence_count,
            "total_sentences": self.total_sentence_count,
        }

        artifacts = (
            ("models/train_unigram_stats.p", unigram_stats),
            ("models/train_bigram_stats.p", bigram_stats),
            ("models/train_trigram_stats.p", trigram_stats),
            ("models/sentence_stats.p", self.sentence_interaction_information),
            ("models/train_corpus_stats.p", corpus_stats),
        )
        for model_path, payload in artifacts:
            # `with` guarantees each pickle is flushed and its handle closed
            with open(model_path, 'wb') as model_file:
                pickle.dump(payload, model_file)
#################################################

# Make sure every target folder exists before anything is processed.
for folder in target_directory.values():
    mkdir_if_missing(folder)

# video_list maps each view to the video names found for it.
video_list = {}
# NOTE(review): [:-5] drops the last five characters of the 'apex' source
# path -- presumably a trailing view sub-folder; confirm against the config.
file_names = get_frame_sequences(source_directory['apex'][:-5],
                                 class_folders=view_list)
for view in view_list:
    video_list[view] = list(file_names[view].keys())

# for each view...
for view in view_list:
    # get a list of all files in the relevant view directory (file list)
    file_list = get_files_list(source_directory[view])
    print('Running view:', view)

    # run through each video...
    for video in tqdm(video_list[view]):

        # create a temporary list with only the relevant video frame names
        video_frame_list = []
        for file in file_list:
            # skip the macOS folder-metadata file
            if file == '.DS_Store': continue
            # video name = everything before the trailing "_<digits>.jpg";
            # a name without that suffix makes re.match return None, so
            # .group() raises AttributeError
            file_name = re.match(r".+(?=_\d+\.jpg)", file).group()
            if file_name == video:
                video_frame_list.append(file)
        # order the frames numerically by the first run of digits that
        # follows an underscore (re.search stops at the first occurrence)
        video_frame_list = sorted(
            video_frame_list,
            key=lambda x: int(re.search(r'(?<=_)[\d]+', x).group()))
Пример #24
0
"""
OCR with textract
http://textract.readthedocs.io/en/stable/
"""
from utils import get_files_list, write_file, set_encoding, local_config
import textract

set_encoding()

SOURCE_PATH = local_config('original_files_path')
OUTPUT_PATH = local_config('processed_files_path')
files = get_files_list(SOURCE_PATH)

for file_name in files:
    # Split on the LAST dot so that "scan.v2.pdf" is still recognised as a pdf
    # and a name without any dot no longer raises IndexError.
    base_name, dot, extension = file_name.rpartition('.')
    if not dot:
        base_name, extension = file_name, ''
    extension = extension.lower()

    if extension == 'pdf':
        print("[PDF]File: %s" % file_name)
        # pdf files are sent through the tesseract OCR method
        text = textract.process(SOURCE_PATH + file_name,
                                method='tesseract',
                                language='eng')
    elif extension == 'png':
        print("[PNG]File: %s" % file_name)
        text = textract.process(SOURCE_PATH + file_name, language='eng')
    else:
        print("File: %s" % file_name)
        text = textract.process(SOURCE_PATH + file_name)
    # function-call form works on both Python 2 and Python 3
    print(text)
    write_file(OUTPUT_PATH + '[TEXTRACT]_' + base_name + '.txt',
               text)
Пример #25
0
    db_prep_handler = DBPrepHandler()
    for i in range(1, 3):
        prep_worker = Worker(f'Prep Worker{i}', prep_queue)
        prep_worker.register_handler(default_prep_handler)
        prep_worker.register_handler(db_prep_handler)
        prep_worker.register_result_queue(db_queue)
        pool.apply_async(prep_worker.process)

    orm_handler = ORMHandler()
    db_handler = DBHandler()
    for i in range(1, 5):
        db_worker = Worker(f'DB Worker{i}', db_queue)
        db_worker.register_handler(orm_handler)
        db_worker.register_handler(db_handler)
        pool.apply_async(db_worker.process)

    pool.close()

    # Read files name from data directory and put an item per file
    for path in get_files_list(Path('data')):
        file_info = file_model_map.get(path.name)
        if file_info is not None:
            model, type, batch_size = file_info
            item = QueueItem(type, {'path': path}, {
                'model': model,
                'batch_size': batch_size
            })
            file_queue.put(item)

    pool.join()
Пример #26
0
                self.pairs_collection[pair_id]=ddi

                entity_1=self.entity_collection[pair_attrs["e1"]]["text"]
                entity_2=self.entity_collection[pair_attrs["e2"]]["text"]

                if entity_1 in self.interaction_collection.keys():
                    if entity_2 not in self.interaction_collection[entity_1].keys():
                        self.interaction_collection[entity_1][entity_2]=ddi
                elif entity_2 in self.interaction_collection.keys():
                    if entity_1 not in self.interaction_collection[entity_2]:
                        self.interaction_collection[entity_2][entity_1]=ddi
                else:
                    self.interaction_collection[entity_2]={}
                    self.interaction_collection[entity_2][entity_1]=ddi
                    self.interaction_collection[entity_2][entity_1]=ddi


        document_data.close()
# Parse the MedLine and DrugBank test corpora and pickle the collected
# entity, interaction and pair tables under ./models.
test_medline_path = "./Test/Test for DDI Extraction task/MedLine"
test_drugbank_path = "./Test/Test for DDI Extraction task/DrugBank"
paths = [test_medline_path, test_drugbank_path]
paths = utils.get_files_list(paths)
test_corpus_instance = test_corpus()
for file in paths:
    test_corpus_instance.parse_ddi_corpus(file)

# Serialize once, after the last file: the original rewrote all three pickles
# on every iteration (same final content) through handles it never closed.
for model_path, collection in (
        ("./models/test_entity_collection.p", test_corpus_instance.entity_collection),
        ("./models/test_interaction_collection.p", test_corpus_instance.interaction_collection),
        ("./models/test_pairs_collection.p", test_corpus_instance.pairs_collection)):
    with open(model_path, "wb") as model_file:
        pickle.dump(collection, model_file)
pprint.pprint(test_corpus_instance.pairs_collection)