        self.root_val_file_name = root_val_file_name
        self.dep_creator = DepCreator()
        self.stop_cont_creator = ContStopCreator()
        np.seterr(divide='ignore', invalid='ignore')

    def sentences(self):
        sentences = []
        with open(self.harmonic_file_name, "r") as fp:
            sentences += fp.readlines()
        with open(self.root_val_file_name, "r") as fp:
            sentences += fp.readlines()
        return sentences

    def initialize_harmonic_values(self):
        sentences = self.sentences()
        for sent in sentences:
            if "attach" in sent:
                self.dep_creator.add_entry(sent)
            if "continue" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "stop" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "root" in sent:
                self.dep_creator.add_entry(sent)


if __name__ == "__main__":
    initializer = HarmonicInitializer("data/harmonic",
                                      "data/root_val_file.txt")
    initializer.initialize_harmonic_values()
    pickle_handler = PickleHandler("data/harmonic_values_numpy")
    pickle_handler.write_to_pickle(initializer.dep_creator.prob_attach,
                                   initializer.stop_cont_creator.prob_cont,
                                   "data/harmonic_values_numpy")

            return self.stop_cont_mult_holder[arc.head_word, arc.dir,
                                              arc.is_adj].prob[self.stop]
        # When the tuple does not have any values,
        # it means trap to constit
        else:
            return 1

    def display(self):
        self.get_hypergraph()
        marginals = self.get_marginals()
        for edge in self.hypergraph.edges:
            print edge.label, marginals[edge.id], self.potentials[edge]
        for node in self.hypergraph.nodes:
            print node.label
        # self.c.show()


if __name__ == "__main__":
    sentence = "NN VBZ"  # "NN VBZ RB VBN VBN"
    pickle_handler = PickleHandler("data/dummy")
    dep_mult, stop_cont_mult = pickle_handler.init_all_dicts()
    parsing = ParsingAlgo(sentence, dep_mult, stop_cont_mult)
    parsing.get_hypergraph()
    parsing.display()
    print sentence
    depen = parsing.best_edges()
    pprint.pprint(depen)

import os

from pickle_handler import PickleHandler
from sklearn_wrapper_extractor import SklearnWrapperExtractor
from dataframe_manipulator import DataframeManipulator

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory,
                                 'pickled_lines_transformed_001.pkl')
    out_pickled_df = None
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    dfm = DataframeManipulator(pickle_handler.df)
    labels = [
        'PII|Health|condition_treatment',
        'PII|Health|health_payment',
        'PII|Health|applications_and_claims',
        'PII|Employment|performance_review'
    ]
    pickle_handler.df = dfm.equalize_rows_by_label(labels)

    # Plot
    df_plot = pickle_handler.df.groupby(
        ['minimum_label'], as_index=False).count()[['minimum_label', 'tag']]
    df_plot['label'] = df_plot.apply(
        lambda x: x['minimum_label'].split('|')[2], axis=1)
    df_plot.columns = ['minimum_label', 'count', 'label']
    plt = df_plot.plot.barh(x='label', y='count')

    sklearn_wrapper_extractor = SklearnWrapperExtractor(pickle_handler.df)
    sklearn_wrapper_extractor.prepare_data()

import pandas as pd
import os

from pickle_handler import PickleHandler

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory, 'pickled_paragraphs.pkl')
    out_pickled_df = os.path.join(in_directory,
                                  'pickled_paragraphs_transformed_001.pkl')
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    pickle_handler.perform_nlp()
    # pickle_handler.set_category_ids()
    pickle_handler.save_pickle()
    print('DONE')

import os

from pickle_handler import PickleHandler
from sklearn_wrapper_extractor import SklearnWrapperExtractor

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory,
                                 'df_pickle_transformed_ids_001.pkl')
    out_pickled_df = None
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    sklearn_wrapper_extractor = SklearnWrapperExtractor(pickle_handler.df)
    sklearn_wrapper_extractor.prepare_data()
    sklearn_wrapper_extractor.vectorize_documents_tokenized()
    # sklearn_wrapper_extractor.get_features_with_chi2()
    sklearn_wrapper_extractor.get_features_with_model()
    # sklearn_wrapper_extractor.save_features()

# import pickle  # to load pickle data
import numpy as np  # for matrix multiplication
from pickle_handler import PickleHandler
import tensorflow as tf  # main tf class for data related ops
import os  # for os related tasks

# preprocess GoT data using the pickle handler class
gotData = PickleHandler("./got.pkl")

# length of sequence to consider for training
len_seq = 100

# total examples per epoch to target
examples_per_seq = len(gotData.text2num) // len_seq

# make a dataset from the encoded text
char_dataset = tf.data.Dataset.from_tensor_slices(gotData.text2num)

# make a sequence dataset out of the char dataset
sequence = char_dataset.batch(len_seq + 1, drop_remainder=True)


# split each sequence into two parts: input and target
def split_input_output(chunk):
    input_text = chunk[:-1]  # input of the text
    output_text = chunk[1:]  # output of the text
    return input_text, output_text


# create a dataset mapping to split the data accordingly
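# The mapping call itself is not shown in this snippet. The parallel training
# script in this collection writes it as below, so this is the likely
# continuation rather than an invented API (note this file names the batched
# dataset `sequence`):
dataset = sequence.map(split_input_output)
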
# load tensorflow for deep learning import tensorflow as tf import numpy as np # for matrix multiplication from pickle_handler import PickleHandler # handle pickle data from sys import argv from tqdm import tqdm filename, modelfile, outputfile = argv # load gotdata gotData = PickleHandler("./got.pkl") def loss(labels, logits): return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True) # define a new model model = tf.keras.Sequential([ # a embedding layer tf.keras.layers.Embedding(gotData.vocab_size(), 256, batch_input_shape=[1, None]), # lstm layer tf.keras.layers.LSTM( 512, stateful=True, return_sequences=True, recurrent_initializer="glorot_uniform",
import pandas as pd
import os

from pickle_handler import PickleHandler

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory, 'df_pickle_001.pkl')
    out_pickled_df = os.path.join(in_directory,
                                  'df_pickle_transformed_001.pkl')
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    pickle_handler.perform_nlp()
    pickle_handler.set_category_ids()
    pickle_handler.save_pickle()

class Parser:

    def __init__(self, corpus_path, initial_values_path,
                 final_value_path, debug_mode):
        self.sentences = self.get_sentences(corpus_path)
        self.debug_mode = debug_mode
        self.final_value_path = final_value_path
        self.pickle_handler = PickleHandler(initial_values_path)
        dep_mult_list, stop_mult_list = \
            self.pickle_handler.init_all_dicts()
        self.stop_multinomial_holder = MultinomialHolder()
        self.stop_multinomial_holder.mult_list = stop_mult_list
        self.dep_multinomial_holder = MultinomialHolder()
        self.dep_multinomial_holder.mult_list = dep_mult_list

    def run_em(self):
        sum_probs = defaultdict(lambda: 1.0)
        for i in range(10):
            print "iteration ", i
            # E-step: accumulate expected counts from the edge marginals
            for sentence in self.sentences:
                if sentence.strip() == "":
                    continue
                parsing_algo = ParsingAlgo(
                    sentence,
                    self.dep_multinomial_holder.mult_list,
                    self.stop_multinomial_holder.mult_list)
                marginals = parsing_algo.get_marginals()
                sum_probs[i] += math.log(parsing_algo.total_potentials)
                edges = parsing_algo.hypergraph.edges
                self.update_counts(marginals, edges)
            # the log-likelihood must not decrease between EM iterations
            if sum_probs[i - 1] != 1.0:
                assert sum_probs[i] > sum_probs[i - 1], \
                    "The probs are %r, %r" % (sum_probs[i], sum_probs[i - 1])
            # M-step: re-estimate the multinomials from the counts
            self.update_parameters()
            self.validate_multinomials(self.dep_multinomial_holder)
            self.validate_multinomials(self.stop_multinomial_holder)
        pickle_hand = PickleHandler(self.final_value_path)
        pickle_hand.write_to_pickle(self.dep_multinomial_holder.mult_list,
                                    self.stop_multinomial_holder.mult_list)
        pprint.pprint(sum_probs)

    def update_counts(self, marginals, edges):
        for edge in edges:
            arc = edge.label
            if arc.is_cont and arc.modifier_word != "":
                self.stop_multinomial_holder.inc_counts(
                    arc.is_cont, (arc.head_word, arc.dir, arc.is_adj),
                    marginals[edge.id])
                self.dep_multinomial_holder.inc_counts(
                    arc.modifier_word, (arc.head_word, arc.dir),
                    marginals[edge.id])
            if not arc.is_cont:
                self.stop_multinomial_holder.inc_counts(
                    arc.is_cont, (arc.head_word, arc.dir, arc.is_adj),
                    marginals[edge.id])

    def update_parameters(self):
        self.dep_multinomial_holder.estimate()
        self.stop_multinomial_holder.estimate()

    def get_sentences(self, file_path):
        sentences = []
        with open(file_path, "r") as fp:
            sentences = fp.readlines()
        return sentences

    def validate_multinomials(self, multinomial_holder):
        for key, mult in multinomial_holder.mult_list.iteritems():
            if self.debug_mode:
                print key
                pprint.pprint(mult.prob)
            total = sum(mult.prob.values())
            assert round(total, 1) == 1.0 or round(total, 1) == 0, \
                "The mult for " + str(key) + " is not totalling to 1 " \
                + str(total)

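# A minimal usage sketch for the Parser above, mirroring the __main__ blocks
# of the other scripts in this collection. The corpus and initial-values paths
# appear elsewhere in the repository; the output path "data/em_final" is an
# assumption for illustration only.
if __name__ == "__main__":
    parser = Parser("data/sentences_train.txt",    # POS-tag corpus, one sentence per line
                    "data/harmonic_values_mult",   # pickled initial multinomials
                    "data/em_final",               # assumed path for the learned values
                    debug_mode=False)
    parser.run_em()
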
from pickle_handler import PickleHandler  # to handle pickle files
import tensorflow as tf  # for deep learning libs
import os  # for os related work

# load the data set to work with
gotData = PickleHandler("./got.pkl")

# sequence length for learning
len_seq = 1000

# total number of examples
examples_per_seq = len(gotData.text2num) // len_seq

# make a char dataset
char_dataset = tf.data.Dataset.from_tensor_slices(gotData.text2num)

# build sequences of tensors from the char data
sequences = char_dataset.batch(len_seq + 1, drop_remainder=True)


def split_input_output(chunk):
    input_text = chunk[:-1]  # input of the text
    output_text = chunk[1:]  # output of the text
    return input_text, output_text


# prepare a dataset of these sequences using the split function
dataset = sequences.map(split_input_output)

# define some dataset params
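# A possible continuation of the pipeline above (a sketch, not the original
# script): BATCH_SIZE and BUFFER_SIZE are assumed values, and shuffle/batch is
# the standard tf.data idiom for turning the mapped sequences into batches.
BATCH_SIZE = 64        # assumed: sequences per training batch
BUFFER_SIZE = 10000    # assumed: shuffle buffer size
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
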
    def initialize_dep(self):
        dep_mult_holder = MultinomialHolder()
        for cond_key, mult in self.harmonic_dep_mult.iteritems():
            for prob_key in mult.prob:
                dep_mult_holder.inc_counts(prob_key, cond_key,
                                           random.random())
        dep_mult_holder.estimate()
        return dep_mult_holder

    def initialize_stop_mult_cont(self):
        stop_cont_mult_holder = MultinomialHolder()
        for cond_key, mult in self.harmonic_stop_cont_mult.iteritems():
            random_value = random.random()
            stop_cont_mult_holder.inc_counts(0, cond_key, random_value)
            stop_cont_mult_holder.inc_counts(1, cond_key, 1 - random_value)
        stop_cont_mult_holder.estimate()
        return stop_cont_mult_holder


if __name__ == "__main__":
    pickle_handler = PickleHandler("data/dummy")
    dep_mult, stop_cont_mult = pickle_handler.init_all_dicts()
    random_init = RandomInitializer(dep_mult, stop_cont_mult)
    random_init.initialize_multinomials()
    pickle_handler = PickleHandler("data/random_init")
    pickle_handler.write_to_pickle(random_init.dep_mult_holder.mult_list,
                                   random_init.stop_cont_mult_holder.mult_list)

        if sent_acc <= 2 and len(actual_dep) >= 2:
            self.incorrect_sent.append(sentence)
            self.incorrect_dep[incorrect_dep_key] += 1

    def get_sentences(self, file_path):
        sentences = []
        with open(file_path, "r") as fp:
            sentences = fp.readlines()
        return sentences

    def write_to_file(self, file_name, data):
        with open(file_name, "wb") as fp:
            fp.writelines(("%s\n" % line for line in data))


if __name__ == "__main__":
    pickle_handler = PickleHandler("data/harmonic_final")
    dep_mult_holder, cont_stop_mult_holder = \
        pickle_handler.init_all_dicts()
    evaluator = Evaluator("data/sentences_train.txt",
                          "data/dep_index_train.txt",
                          dep_mult_holder, cont_stop_mult_holder)
    evaluator.evaluate_sentences()
    evaluator.write_to_file("incorrect_sent_rule", evaluator.incorrect_sent)
    with open("incorrect_dep_dict_new", "wb") as fp:
        pickle.dump(evaluator.incorrect_dep, fp)

import pandas as pd
import os

from pickle_handler import PickleHandler

if __name__ == '__main__':
    in_directory = r'E:\Corpora\PII_Directory_20190507'
    in_pickled_df = os.path.join(in_directory,
                                 'df_pickle_transformed_001.pkl')
    out_pickled_df = os.path.join(in_directory,
                                  'df_pickle_transformed_ids_001.pkl')
    pickle_handler = PickleHandler(in_pickled_df, out_pickled_df)
    pickle_handler.set_category_ids()

from pickle_handler import PickleHandler

N_best_features = 10

ph = PickleHandler(
    in_pickled_df=r'E:\Corpora\PII_Directory_20190507\sklearn_features.pkl')
best_features_dict = ph.df
targets = [target for target in best_features_dict.keys()]
print(targets)
features = best_features_dict['PII|Tax|itin_tax_id'][0][-N_best_features:]
print(features)
a = 1

            sentences += fp.readlines()
        with open(self.root_val_file_name, "r") as fp:
            sentences += fp.readlines()
        return sentences

    def create_dict(self):
        sentences = self.sentences()
        for sent in sentences:
            if "attach" in sent:
                self.dep_creator.add_entry(sent)
            if "continue" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "stop" in sent:
                self.stop_cont_creator.add_entry(sent)
            if "root" in sent:
                self.dep_creator.add_entry(sent)
        self.dep_creator.mult_holder.estimate()
        self.stop_cont_creator.mult_holder.estimate()


if __name__ == "__main__":
    initializer = InitDict("data/harmonic", "data/root_val_file.txt")
    initializer.create_dict()
    pickle_handler = PickleHandler("data/harmonic_values_mult")
    dep_mult_list = initializer.dep_creator.mult_holder.mult_list
    stop_cont_mult_list = initializer.stop_cont_creator.mult_holder.mult_list
    pickle_handler.write_to_pickle(dep_mult_list, stop_cont_mult_list)

class ThreadsDownloader4chan:

    HEADER = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
        "cookie": "__cfduid=d903e3abeaca2effe91e7b839a96be7211527491373; _ga=GA1.3.1213173136.1527491373; _ga=GA1.2.2716196.1533521826; _gid=GA1.2.2067292582.1537358233; _gid=GA1.3.2067292582.1537358233; Hm_lvt_ba7c84ce230944c13900faeba642b2b4=1537359428,1537361149,1537362700,1537363469; Hm_lpvt_ba7c84ce230944c13900faeba642b2b4=1537363858"
    }
    PROXIES = {
        "http": "socks5://127.0.0.1:10808",
        "https": "socks5://127.0.0.1:10808"
    }

    def __init__(self, base_url: str, threads_num: Optional[int] = 999,
                 target_formats: Optional[str] = None) -> None:
        self.base_url = base_url
        self.threads_num = threads_num
        self.target_formats = target_formats.split(
            ',') if target_formats else target_formats
        self.pre_download_dict = dict()
        self.history_handler = PickleHandler('wallpaper.history')
        self.history_urls = self.history_handler.load()
        self.download_folder = './4chan_thread_download_folder'
        self.lock = threading.Lock()

    def set_total(self, n):
        self.counter = 0
        self.total = n

    def get_process(self):
        with self.lock:
            self.counter += 1
            return f"[{self.counter}/{self.total}]"

    @retry(wait_exponential_multiplier=1000, wait_exponential_max=60000)
    def request_get_with_retry(self, url, **args):
        default_args = {
            'headers': ThreadsDownloader4chan.HEADER,
            'proxies': ThreadsDownloader4chan.PROXIES
        }
        default_args.update(args)
        default_args = {i[0]: i[1] for i in default_args.items() if i[1]}
        return requests.get(url, **default_args)

    def history_filter(self):
        self.pre_download_list = []
        for thread_name, imgs_f_name in self.pre_download_dict.items():
            temp_imgs_f_name = []
            for img_f_name in imgs_f_name:
                if img_f_name[0] in self.history_urls:
                    continue
                elif (self.target_formats and
                      img_f_name[0].lower().split('.')[-1]
                      not in self.target_formats):
                    continue
                else:
                    temp_imgs_f_name.append(img_f_name)
                    self.history_urls.add(img_f_name[0])
            if temp_imgs_f_name:
                for temp_img_f_name in temp_imgs_f_name:
                    self.pre_download_list.append(
                        [thread_name, temp_img_f_name[0], temp_img_f_name[1]])
        self.history_handler.dump(self.history_urls)

    @property
    def url_catalog2thread(self):
        if self.base_url[-1] != '/':
            url = self.base_url + '/'
        else:
            url = self.base_url
        return url.replace('catalog', 'thread')

    def get_all_thread(self, base_url):
        r = self.request_get_with_retry(base_url)
        data = json.loads(
            re.findall(r"var catalog =(.*?);var style_group =", r.text)[0])
        threads_url = list(data['threads'].keys())
        self.threads_url = [self.url_catalog2thread + i for i in threads_url]
        self.threads_url = self.threads_url[:self.threads_num]

    def parse_thread_get_img_url(self, thread_url):
        try:
            r = self.request_get_with_retry(thread_url)
            thread_name = (re.findall(
                r'<title>(.*?)</title>', r.text)[0].replace('|', '').replace(
                    '?', '').replace('*', '').replace('#', '').replace(
                        '\\', '').replace('<', '').replace('>', '').replace(
                            ':', '').replace('“', '').replace(
                                '/', '').split('-')[1].strip())
            print(self.get_process(), 'Parsed thread:', thread_name)
            html = etree.HTML(r.content)
            imgs = html.xpath(".//a[@class='fileThumb']/@href")
            imgs = ['https:' + i for i in imgs]
            f_name = html.xpath(".//div[@class='fileText']/a/text()")
            imgs_f_name = list(zip(imgs, f_name))
            self.pre_download_dict.update({thread_name: imgs_f_name})
        except:
            print(traceback.format_exc())

    def run(self):
        self.get_all_thread(self.base_url)
        self.set_total(len(self.threads_url))
        with ThreadPoolExecutor(8) as executor:
            executor.map(self.parse_thread_get_img_url, self.threads_url)
        self.history_filter()
        self.set_total(len(self.pre_download_list))
        with ThreadPoolExecutor(16) as executor:
            executor.map(self.downloader, self.pre_download_list)

    def downloader(self, items):
        try:
            print(self.get_process(), 'Downloading:', items)
            thread_name, img, f_name = items
            download_folder = os.path.join(self.download_folder, thread_name)
            with self.lock:
                if not os.path.exists(download_folder):
                    os.makedirs(download_folder)
            content = self.request_get_with_retry(img).content
            with open(os.path.join(download_folder, f_name), 'wb') as f:
                f.write(content)
        except:
            print(traceback.format_exc())

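# A minimal usage sketch for ThreadsDownloader4chan, assuming the imports the
# class relies on (typing.Optional, threading, json, re, os, traceback,
# requests, retrying.retry, lxml.etree, concurrent.futures.ThreadPoolExecutor,
# PickleHandler) are present at module level. The catalog URL and format list
# below are illustrative assumptions, not values taken from the repository.
if __name__ == "__main__":
    downloader = ThreadsDownloader4chan(
        "https://boards.4chan.org/wg/catalog",  # assumed example catalog URL
        threads_num=10,
        target_formats="jpg,png")
    downloader.run()
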
# load tensorflow for deep learning import tensorflow as tf import numpy as np # for matrix multiplication from pickle_handler import PickleHandler # handle pickle data from sys import argv from tqdm import tqdm filename, modelfile, outputfile = argv # load gotdata gotData = PickleHandler("./got.pkl") def loss(labels, logits): return tf.keras.losses.sparse_categorical_crossentropy( labels, logits, from_logits=True ) # define a new model model = tf.keras.Sequential( [ # a embedding layer tf.keras.layers.Embedding( gotData.vocab_size(), 256, batch_input_shape=[1, None] ), # lstm layer # bidirectional rnn layer tf.keras.layers.Bidirectional( tf.keras.layers.SimpleRNN( 1024,
from pickle_handler import PickleHandler

N_best_features = 10

ph = PickleHandler(
    in_pickled_df=r'E:\Corpora\PII_Directory_20190507\features_20190528.pkl')
best_features_dict = ph.df
a = 1