def read_urls_dict(filename):
    """Load a tab-separated ``doc_id<TAB>url`` file into a TwoWayDict.

    Args:
        filename: Path of the text file to read. Every non-blank line must
            hold an integer document id, a tab, and the document URL.

    Returns:
        A TwoWayDict filled through ``add(doc_id, doc_url)`` for each
        non-blank line, in file order.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has no second (tab-separated) field.
    """
    urls_dict = TwoWayDict()
    # The context manager guarantees the handle is closed even if a line
    # fails to parse.
    with open(filename, 'r') as handle:
        for line in handle:
            record = line.rstrip('\n')
            # str.split never returns an empty list, so blank lines have to
            # be detected on the text itself.
            if not record.strip():
                continue
            data = record.split('\t')
            doc_id = int(data[0])
            doc_url = data[1]
            urls_dict.add(doc_id, doc_url)
    return urls_dict
def get_queries(filename):
    """Load a tab-separated ``query_id<TAB>query`` file into a TwoWayDict.

    Args:
        filename: Path of the text file to read. Every non-blank line must
            hold an integer query id, a tab, and the query text.

    Returns:
        A TwoWayDict filled through ``add(qid, query)`` for each non-blank
        line, in file order.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has no second (tab-separated) field.
    """
    queries = TwoWayDict()
    # The context manager guarantees the handle is closed even if a line
    # fails to parse.
    with open(filename, 'r') as handle:
        for line in handle:
            record = line.rstrip('\n')
            # str.split never returns an empty list, so blank lines have to
            # be detected on the text itself.
            if not record.strip():
                continue
            data = record.split('\t')
            qid = int(data[0])
            query = data[1]
            queries.add(qid, query)
    return queries
def create_vocab_single(self):
    """Rebuild ``self.vocab`` from the tokens of all lemmatized queries.

    Tokens are visited query by query, in the iteration order of
    ``self.lemmatized_queries``; the first occurrence of each token is
    added to a fresh TwoWayDict with the next consecutive index (from 0).
    """
    # Gather every token before touching self.vocab, as the original did.
    tokens = []
    for q_id in self.lemmatized_queries.keys():
        tokens.extend(self.lemmatized_queries[q_id])

    self.vocab = TwoWayDict()
    next_index = 0
    for token in tokens:
        # Only tokens not yet registered receive a new index.
        if self.vocab.dict.get(token, None) is None:
            self.vocab.add(token, next_index)
            next_index += 1
def __init__(self, base_pairs, invert_cond, name_pairs=None, his_him=True):
    """Store the substitution settings and build the lookup tables.

    Args:
        base_pairs: Iterable of ``(male, female)`` string pairs; both
            members are lower-cased before being stored.
        invert_cond: Name of the inversion condition; stored as-is.
        name_pairs: Optional iterable of ``(male, female)`` name pairs,
            lower-cased like ``base_pairs``. ``None`` (the default) yields
            an empty name table.
        his_him: Whether to apply the special-case intervention to
            him/his/her/hers.
    """
    # logging.info("Loading spaCy model...")
    # self.nlp = spacy.load(spacy_model)
    # logging.info("Done.")

    # This flag tells it whether or not to apply the special case
    # intervention to him/his/her/hers
    self.his_him = his_him
    self.invert_cond = invert_cond

    self.base_pairs = TwoWayDict()
    for (male, female) in base_pairs:
        self.base_pairs[male.lower()] = female.lower()

    self.name_pairs = TwoWayDict()
    # name_pairs defaults to None; iterating None would raise TypeError,
    # so an omitted argument simply leaves the table empty.
    if name_pairs is not None:
        for (male, female) in name_pairs:
            self.name_pairs[male.lower()] = female.lower()
def __init__(self, Job_List, Task_List, Slot_Number, Data_Partition, Job_Task_Dict):
    """Build the name/index lookup tables for jobs, tasks and datacenters.

    Args:
        Job_List: Indexable sequence of job names.
        Task_List: Indexable sequence of task names.
        Slot_Number: Mapping indexed with ``'DC'`` (datacenter names) and
            ``'Num of Slots'`` (slot counts); both are read by integer
            position. Presumably a table-like object - confirm with caller.
        Data_Partition: Mapping indexed with ``'Data Partition'`` and
            ``'Location'``, both read by integer position.
        Job_Task_Dict: Maps each job name to an iterable of its task names.
    """
    self.job = TwoWayDict()
    self.task = TwoWayDict()
    self.datacenter = TwoWayDict()
    self.slots = dict()  # how many slots each datacenter has
    self.data_partition2datacenter = dict()
    # self.datacenter2data_partition = dict()

    # Name -> position tables.
    for job_pos in range(len(Job_List)):
        self.job[Job_List[job_pos]] = job_pos
    for task_pos in range(len(Task_List)):
        self.task[Task_List[task_pos]] = task_pos
    dc_names = Slot_Number['DC']
    for dc_pos in range(len(dc_names)):
        self.datacenter[dc_names[dc_pos]] = dc_pos

    # Slot counts are stored under both keys of every datacenter.
    slot_counts = Slot_Number['Num of Slots']
    for dc_pos in range(len(slot_counts)):
        count = slot_counts[dc_pos]
        self.slots[dc_pos] = count  # e.g. 0 maps to 2
        self.slots[self.datacenter[dc_pos]] = count  # e.g. DC1 maps to 2

    # Data partition name -> name of the datacenter holding it.
    partition_names = Data_Partition['Data Partition']
    for part_pos in range(len(partition_names)):
        location = Data_Partition['Location'][part_pos]
        self.data_partition2datacenter[partition_names[part_pos]] = location
        # self.datacenter2data_partition[datacenter_name] = data_partition_name

    self.job_list = Job_List
    self.task_list = Task_List

    dc_names = Slot_Number['DC']
    self.datacenter_list = [
        dc_names[dc_pos] for dc_pos in range(len(dc_names))
    ]
    partition_names = Data_Partition['Data Partition']
    self.data_partition_list = [
        partition_names[part_pos] for part_pos in range(len(partition_names))
    ]

    # Job id -> list of the ids of its tasks.
    self.job_task_idx_mapping = {}
    for job_pos in range(len(Job_List)):
        job_name = Job_List[job_pos]
        job_id = self.get_job(job_name)
        task_ids = self.job_task_idx_mapping[job_id] = list()
        for task_name in Job_Task_Dict[job_name]:
            task_ids.append(self.get_task(task_name))
def __init__(self):
    """Create three empty TwoWayDict vocabularies and zero their counters."""
    self.vocab_1, self.vocab_2, self.vocab_phrase = (
        TwoWayDict(),
        TwoWayDict(),
        TwoWayDict(),
    )
    # Next free index for vocab_1, vocab_2 and vocab_phrase respectively.
    self._idx1 = self._idx2 = self._idx_phrase = 0
class Vocab:
    """Three vocabularies that hand out consecutive indices to new entries.

    ``vocab_1`` is fed by ``add1``, ``vocab_2`` by ``add2`` and
    ``vocab_phrase`` by ``add_phrase``. Each has its own counter starting
    at 0; an entry that is already present is left untouched.
    """

    def __init__(self):
        """Start with empty vocabularies and all counters at zero."""
        self.vocab_1, self.vocab_2, self.vocab_phrase = (
            TwoWayDict(),
            TwoWayDict(),
            TwoWayDict(),
        )
        self._idx1 = self._idx2 = self._idx_phrase = 0

    def add1(self, tkn: str):
        """Register ``tkn`` in ``vocab_1`` unless it already has an index."""
        if self.vocab_1.dict.get(tkn, None) is not None:
            return
        self.vocab_1.add(tkn, self._idx1)
        self._idx1 += 1

    def add2(self, gram: tuple):
        """Register ``gram`` in ``vocab_2`` unless it already has an index."""
        if self.vocab_2.dict.get(gram, None) is not None:
            return
        self.vocab_2.add(gram, self._idx2)
        self._idx2 += 1

    def add_phrase(self, phrase: tuple):
        """Register ``phrase`` in ``vocab_phrase`` unless already indexed."""
        if self.vocab_phrase.dict.get(phrase, None) is not None:
            return
        self.vocab_phrase.add(phrase, self._idx_phrase)
        self._idx_phrase += 1
# Load queries, documents and the precomputed n-gram statistics.
queries_filename = data_folder + 'queries.numerate_review.txt'
sample_pred = load_predict(data_folder + 'sample_sabmission.txt')
queries, vocab = load_queries(queries_filename)

# NOTE: unpickling can execute arbitrary code; these files must come from a
# trusted source. Each file is opened in a `with` block so its handle is
# closed as soon as the object has been loaded.
with open(data_folder + 'documents.pkl', 'rb') as _pkl_file:
    docs_obj = pickle.load(_pkl_file)  # get_documents(data_folder)
documents = docs_obj.docs
for d in documents:
    d.data_path = data_folder

# Map the original document / query ids onto consecutive positions.
_doc_keys = list(docs_obj.docs_ids.keys())
doc_ids_map = TwoWayDict(keys=_doc_keys,
                         items=list(range(len(_doc_keys))))
queries_ids_map = TwoWayDict(keys=list(queries.keys()),
                             items=list(range(len(queries.keys()))))

docs_num = len(doc_ids_map)
queries_num = len(queries_ids_map)
unigrams_num = len(vocab.vocab_1)

with open(statistics_folder + 'unigram_counts.pkl', 'rb') as _pkl_file:
    counts_unigram = pickle.load(_pkl_file)
with open(statistics_folder + 'bigram_counts_raw.pkl', 'rb') as _pkl_file:
    counts_bigram_raw = pickle.load(_pkl_file)
with open(statistics_folder + 'bigram_counts_inv.pkl', 'rb') as _pkl_file:
    counts_bigram_inv = pickle.load(_pkl_file)
class Substitutor:
    """Replace gendered words in POS-tagged documents.

    A document is a sequence of mutable items whose element 0 is the word
    and element 1 its part-of-speech tag (e.g. ``["his", "PRP$"]``).
    Replacements are written back into element 0 of each item in place.
    """

    def __init__(self, base_pairs, invert_cond, name_pairs=None, his_him=True):
        """Store the substitution settings and build the lookup tables.

        Args:
            base_pairs: Iterable of ``(male, female)`` string pairs; both
                members are lower-cased before being stored.
            invert_cond: Inversion condition. ``"invert_word_names"`` and
                ``"invert_word_neutral"`` select the matching method, any
                other value uses ``invert_word``; ``"invert_control"``
                additionally makes ``invert_document`` always invert.
            name_pairs: Optional iterable of ``(male, female)`` name pairs.
                ``None`` (the default) yields an empty name table.
            his_him: Whether to apply the special-case intervention to
                him/his/her/hers.
        """
        self.his_him = his_him
        self.invert_cond = invert_cond

        self.base_pairs = TwoWayDict()
        for (male, female) in base_pairs:
            self.base_pairs[male.lower()] = female.lower()

        self.name_pairs = TwoWayDict()
        # name_pairs defaults to None; iterating None would raise TypeError,
        # so an omitted argument simply leaves the table empty.
        if name_pairs is not None:
            for (male, female) in name_pairs:
                self.name_pairs[male.lower()] = female.lower()

    def probablistic_substitute(self, input_texts):
        """Yield every document, inverting each one on a fair coin flip.

        Args:
            input_texts: Iterable of documents (see the class docstring).

        Yields:
            ``invert_document(text)`` when the coin comes up 1, otherwise
            the document unchanged.
        """
        for text in input_texts:
            if bool(random.getrandbits(1)):
                yield self.invert_document(text)
            else:
                yield text

    def invert_document(self, input_text):
        """Substitute the words of one document in place and return it.

        Under ``"invert_control"`` the document is always processed; under
        any other condition it is processed only when a coin flip succeeds.

        Args:
            input_text: Sequence of mutable ``[word, pos, ...]`` items.

        Returns:
            The same ``input_text`` object, possibly modified.
        """
        # The coin is flipped first in every case so the random stream is
        # consumed identically regardless of the condition.
        if bool(random.getrandbits(1)) or self.invert_cond == "invert_control":
            if self.invert_cond == "invert_word_names":
                inverter = self.invert_word_names
            elif self.invert_cond == "invert_word_neutral":
                inverter = self.invert_word_neutral
            else:  # "invert_control" or "invert_race"
                inverter = self.invert_word

            for idx, word_pos in enumerate(input_text):
                flipped = inverter(word_pos)
                if flipped is not None:
                    input_text[idx][0] = flipped
        return input_text

    def invert_word(self, word_pos):
        """Return the opposite-gender counterpart of a word, or ``None``.

        Lookup order: base pairs, name pairs, then (if ``his_him`` is set)
        the him/his/her/hers special cases.

        Args:
            word_pos: Indexable whose element 0 is the word and element 1
                its POS tag.

        Returns:
            The replacement with its case matched to the word, or ``None``
            when no substitution applies.
        """
        return self._invert(word_pos, neutral=False)

    def invert_word_neutral(self, word_pos):
        """Return a replacement using gender-neutral pronouns, or ``None``.

        he/she become "they"; base and name pairs are swapped as in
        ``invert_word``; him/his/her/hers become them/their/theirs when
        ``his_him`` is set.
        """
        return self._invert(word_pos, neutral=True)

    def invert_word_names(self, word_pos):
        """Like ``invert_word_neutral`` but names become a placeholder.

        Any word found in the name table is replaced by
        ``"NAME-PLACEHOLDER"`` (case-matched like every other replacement).
        """
        return self._invert(
            word_pos, neutral=True, name_replacement="NAME-PLACEHOLDER"
        )

    def _invert(self, word_pos, neutral, name_replacement=None):
        """Shared lookup behind the three public ``invert_word*`` methods."""
        word, pos = word_pos[0], word_pos[1]
        text = word.lower()

        flipped = None
        if neutral and (text == "he" or text == "she"):
            flipped = "they"
        elif text in self.base_pairs.keys():
            flipped = self.base_pairs[text]
        elif text in self.name_pairs.keys():
            if name_replacement is None:
                flipped = self.name_pairs[text]
            else:
                flipped = name_replacement
        elif self.his_him:
            # Special case: him/his/her/hers depend on the POS tag.
            if neutral:
                flipped = self._neutral_pronoun(text, pos)
            else:
                flipped = self._gendered_pronoun(text, pos)

        if flipped is None:
            return None
        # Attempt to approximate case-matching
        return self.match_case(flipped, word)

    @staticmethod
    def _gendered_pronoun(text, pos):
        """Map him/his/her/hers to the opposite gender; ``None`` otherwise."""
        if text == "him":
            return "her"
        if text == "his":
            return "hers" if pos == "NNS" else "her"  # else: PRP/PRP$
        if text == "her":
            return "his" if pos == "PRP$" else "him"  # else: PRP
        if text == "hers":
            return "his"
        return None

    @staticmethod
    def _neutral_pronoun(text, pos):
        """Map him/his/her/hers to them/their/theirs; ``None`` otherwise."""
        if text == "him":
            return "them"
        if text == "his":
            return "theirs" if pos == "NNS" else "their"  # else: PRP$
        if text == "her":
            return "their" if pos == "PRP$" else "them"  # else: PRP
        if text == "hers":
            return "theirs"
        return None

    @staticmethod
    def match_case(input_string, target_string):
        """Give ``input_string`` the casing pattern of ``target_string``.

        This is a very naive approach, but for most purposes it should be
        okay: all-lower, all-upper and Capitalized targets are recognised;
        for anything else (including an empty target) ``input_string`` is
        returned unchanged.
        """
        if not target_string:
            # Nothing to imitate; also avoids indexing an empty string.
            return input_string
        if target_string.islower():
            return input_string.lower()
        if target_string.isupper():
            return input_string.upper()
        if target_string[0].isupper() and target_string[1:].islower():
            # Slicing instead of [0] keeps an empty input_string safe.
            return input_string[:1].upper() + input_string[1:].lower()
        return input_string
class Queries:
    """Container for queries, their lemmatized tokens and a token vocabulary."""

    def __init__(self, ngram_range=None):
        """Create an empty container.

        Args:
            ngram_range: Optional ``(start, stop)`` pair. When set,
                ``get_token_ids`` looks up n-grams for every
                ``n in range(start, stop)`` (``stop`` excluded).
        """
        self.queries = None
        self.lemmatized_queries = None
        self.ngram_range = ngram_range

    def lemmatize(self, stop_words=None):
        """Tokenize and lemmatize every loaded query.

        Fills ``self.lemmatized_queries`` with ``{int(query_id): lemmas}``.
        ``load`` must have been called first.

        Args:
            stop_words: Forwarded to both ``Tokenizer`` and ``Lemmatizer``.
        """
        tokenizer = Tokenizer(stop_words=stop_words)
        lemmatizer = Lemmatizer(stop_words=stop_words)
        self.lemmatized_queries = dict()
        for q_id in self.queries.dict.keys():
            q = self.queries.get(q_id)
            tok_q = tokenizer.fit_transform(q)
            lem_q = lemmatizer.fit_transform(tok_q)
            self.lemmatized_queries[int(q_id)] = lem_q

    def get_ngrams(self, tokens, ngram):
        """Return all contiguous ``ngram``-length windows of ``tokens``.

        Args:
            tokens: Sliceable sequence of tokens.
            ngram: Window length.

        Returns:
            A list of tuples, empty when ``tokens`` is shorter than
            ``ngram``.
        """
        grams = [
            tuple(tokens[i:i + ngram])
            for i in range(len(tokens) - ngram + 1)
        ]
        return grams

    def create_vocab_single(self):
        """Rebuild ``self.vocab`` from the individual lemmatized tokens.

        The first occurrence of each token gets the next consecutive index,
        starting at 0.
        """
        words = [
            w
            for q_id in self.lemmatized_queries.keys()
            for w in self.lemmatized_queries[q_id]
        ]
        self.vocab = TwoWayDict()
        idx = 0
        for w in words:
            if self.vocab.dict.get(w, None) is not None:
                continue
            self.vocab.add(w, idx)
            idx += 1

    def create_vocab(self):
        """Build the vocabulary; currently this is single tokens only.

        NOTE(review): ``get_token_ids`` looks up n-gram tuples when
        ``ngram_range`` is set, while this vocabulary is keyed by single
        tokens - confirm that this combination is intended.
        """
        self.create_vocab_single()

    def load(self, filename):
        """Read the queries from ``filename`` via ``get_queries``."""
        self.queries = get_queries(filename)

    def get_token_ids(self):
        """Translate every lemmatized query into vocabulary ids.

        Returns:
            ``{query_id: [ids]}``. Without ``ngram_range`` each token is
            looked up with ``self.vocab.get``. With ``ngram_range`` the
            n-gram tuples of every size in the range are looked up; a size
            whose lookup fails is reported on stdout and skipped.
        """
        res = {}
        for q_id in self.lemmatized_queries.keys():
            q = self.lemmatized_queries[q_id]
            q_tok_ids = []
            if self.ngram_range is not None:
                for ngram in range(self.ngram_range[0], self.ngram_range[1]):
                    try:
                        q_tok_ids += [
                            self.vocab.get(w)
                            for w in self.get_ngrams(q, ngram)
                        ]
                    except Exception:
                        # Best effort: report and carry on. Formatting
                        # (rather than ``'error: ' + q``) keeps the handler
                        # itself from raising when q is a list of tokens.
                        print('error: {}'.format(q))
            else:
                q_tok_ids = [self.vocab.get(w) for w in q]
            res[q_id] = q_tok_ids
        return res