Exemplo n.º 1
0
def read_urls_dict(filename):
    """Load a tab-separated ``doc_id<TAB>url`` file into a ``TwoWayDict``.

    Each non-blank line must hold an integer document id in the first column
    and the document URL in the second column.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``TwoWayDict`` populated via ``add(doc_id, doc_url)`` for every
        non-blank line, in file order.

    Raises:
        ValueError: if the first column of a non-blank line is not an integer.
        IndexError: if a non-blank line has no second column.
    """
    urls_dict = TwoWayDict()

    # The context manager guarantees the handle is closed, even when a line
    # fails to parse.
    with open(filename, 'r') as handle:
        for line in handle:
            # ``str.split`` never returns an empty list, so blank lines have
            # to be detected on the raw text instead of on the split result.
            if not line.strip():
                continue
            data = line.replace('\n', '').split('\t')
            doc_id = int(data[0])
            doc_url = data[1]
            urls_dict.add(doc_id, doc_url)
    return urls_dict
Exemplo n.º 2
0
def get_queries(filename):
    """Load a tab-separated ``query_id<TAB>query`` file into a ``TwoWayDict``.

    Each non-blank line must hold an integer query id in the first column and
    the query text in the second column.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``TwoWayDict`` populated via ``add(qid, query)`` for every non-blank
        line, in file order.

    Raises:
        ValueError: if the first column of a non-blank line is not an integer.
        IndexError: if a non-blank line has no second column.
    """
    queries = TwoWayDict()

    # The context manager guarantees the handle is closed, even when a line
    # fails to parse.
    with open(filename, 'r') as handle:
        for line in handle:
            # ``str.split`` never returns an empty list, so blank lines have
            # to be detected on the raw text instead of on the split result.
            if not line.strip():
                continue
            data = line.replace('\n', '').split('\t')
            qid = int(data[0])
            query = data[1]
            queries.add(qid, query)

    return queries
Exemplo n.º 3
0
    def create_vocab_single(self):
        """Rebuild ``self.vocab`` from every token of the lemmatized queries.

        Tokens are numbered 0, 1, 2, ... in order of first appearance; a token
        that is already registered keeps its first id.
        """
        # Flatten all queries into one token stream before touching the vocab.
        tokens = []
        for query_id in self.lemmatized_queries.keys():
            tokens.extend(self.lemmatized_queries[query_id])

        vocab = TwoWayDict()
        self.vocab = vocab
        next_id = 0
        for token in tokens:
            if vocab.dict.get(token, None) is None:
                vocab.add(token, next_id)
                next_id += 1
Exemplo n.º 4
0
    def __init__(self, base_pairs, invert_cond, name_pairs=None, his_him=True):
        """Store the substitution configuration and lower-cased word pairs.

        Args:
            base_pairs: iterable of ``(male, female)`` string pairs.
            invert_cond: name of the inversion condition to apply.
            name_pairs: optional iterable of ``(male, female)`` name pairs;
                ``None`` (the default) leaves the name table empty.
            his_him: whether to apply the special-case intervention to
                him/his/her/hers.
        """
        self.his_him = his_him
        self.invert_cond = invert_cond

        self.base_pairs = TwoWayDict()
        for (male, female) in base_pairs:
            self.base_pairs[male.lower()] = female.lower()

        self.name_pairs = TwoWayDict()
        # ``name_pairs`` defaults to None, which is not iterable: treat it as
        # "no name pairs" instead of raising TypeError.
        if name_pairs is not None:
            for (male, female) in name_pairs:
                self.name_pairs[male.lower()] = female.lower()
Exemplo n.º 5
0
    def __init__(self, Job_List, Task_List, Slot_Number, Data_Partition,
                 Job_Task_Dict):
        """Build name/index lookups for jobs, tasks, datacenters and partitions.

        Args:
            Job_List: sequence of job names, indexable by position.
            Task_List: sequence of task names, indexable by position.
            Slot_Number: container with a ``'DC'`` entry and a
                ``'Num of Slots'`` entry, each indexable by 0..len-1.
            Data_Partition: container with a ``'Data Partition'`` entry and a
                ``'Location'`` entry, each indexable by 0..len-1.
            Job_Task_Dict: maps each job name to an iterable of task names.
        """
        # Name <-> index tables; each name is stored against its position.
        self.job = TwoWayDict()
        self.task = TwoWayDict()
        self.datacenter = TwoWayDict()
        self.slots = dict()  # how many slots each datacenter has
        self.data_partition2datacenter = dict()
        # self.datacenter2data_partition = dict()
        for i in range(len(Job_List)):
            self.job[Job_List[i]] = i
        for i in range(len(Task_List)):
            self.task[Task_List[i]] = i
        for i in range(len(Slot_Number['DC'])):
            self.datacenter[Slot_Number['DC'][i]] = i
        for i in range(len(Slot_Number['Num of Slots'])):
            # The slot count is stored twice: under the integer index and
            # under whatever ``self.datacenter[i]`` returns.
            # NOTE(review): presumably the reverse lookup yields the
            # datacenter name -- confirm against TwoWayDict.__getitem__.
            self.slots[i] = Slot_Number['Num of Slots'][i]  # e.g. 0 maps to 2
            self.slots[self.datacenter[i]] = Slot_Number['Num of Slots'][
                i]  # e.g. DC1 maps to 2
        for i in range(len(Data_Partition['Data Partition'])):
            data_partition_name = Data_Partition['Data Partition'][i]
            datacenter_name = Data_Partition['Location'][i]
            self.data_partition2datacenter[
                data_partition_name] = datacenter_name
            # self.datacenter2data_partition[datacenter_name] = data_partition_name

        # Keep the names as ordered lists as well.
        self.job_list = Job_List
        self.task_list = Task_List
        self.datacenter_list = list()
        for i in range(len(Slot_Number['DC'])):
            self.datacenter_list.append(Slot_Number['DC'][i])
        self.data_partition_list = list()
        for i in range(len(Data_Partition['Data Partition'])):
            self.data_partition_list.append(
                Data_Partition['Data Partition'][i])

        # For every job, collect what ``get_task`` returns for each of its
        # tasks, keyed by what ``get_job`` returns for the job name.
        # NOTE(review): get_job/get_task are defined outside this block;
        # presumably they return the integer ids stored above -- confirm.
        self.job_task_idx_mapping = {}
        for i in range(len(Job_List)):
            job_name = Job_List[i]
            job_id = self.get_job(job_name)
            self.job_task_idx_mapping[job_id] = list()
            for task_name in Job_Task_Dict[job_name]:
                task_id = self.get_task(task_name)
                self.job_task_idx_mapping[job_id].append(task_id)
Exemplo n.º 6
0
    def __init__(self):
        """Create three empty vocabularies and zero their next-id counters."""
        self.vocab_1, self.vocab_2, self.vocab_phrase = (
            TwoWayDict(),
            TwoWayDict(),
            TwoWayDict(),
        )

        # Next free index for each vocabulary.
        self._idx1 = self._idx2 = self._idx_phrase = 0
Exemplo n.º 7
0
class Vocab:
    """Three independent vocabularies that hand out consecutive integer ids.

    ``vocab_1`` takes string tokens, ``vocab_2`` and ``vocab_phrase`` take
    tuples. Each vocabulary has its own counter, so ids start at 0 in each.
    """

    def __init__(self):
        """Create the empty vocabularies and zero their next-id counters."""
        self.vocab_1, self.vocab_2, self.vocab_phrase = (
            TwoWayDict(),
            TwoWayDict(),
            TwoWayDict(),
        )

        # Next free index for each vocabulary.
        self._idx1 = self._idx2 = self._idx_phrase = 0

    def add1(self, tkn: str):
        """Register ``tkn`` in ``vocab_1`` unless it already has an entry."""
        known = self.vocab_1.dict.get(tkn, None)
        if known is not None:
            return
        self.vocab_1.add(tkn, self._idx1)
        self._idx1 += 1

    def add2(self, gram: tuple):
        """Register ``gram`` in ``vocab_2`` unless it already has an entry."""
        known = self.vocab_2.dict.get(gram, None)
        if known is not None:
            return
        self.vocab_2.add(gram, self._idx2)
        self._idx2 += 1

    def add_phrase(self, phrase: tuple):
        """Register ``phrase`` in ``vocab_phrase`` unless already present."""
        known = self.vocab_phrase.dict.get(phrase, None)
        if known is not None:
            return
        self.vocab_phrase.add(phrase, self._idx_phrase)
        self._idx_phrase += 1
Exemplo n.º 8
0
    # Input locations: everything is read relative to ``data_folder`` and
    # ``statistics_folder``, which are defined outside this fragment.
    queries_filename = data_folder + 'queries.numerate_review.txt'

    # NOTE(review): the file name is spelled 'sabmission' -- confirm it
    # matches the file on disk before "fixing" it.
    sample_pred = load_predict(data_folder + 'sample_sabmission.txt')

    queries, vocab = load_queries(queries_filename)

    # NOTE(review): pickle.load executes arbitrary code from the file, so
    # these pickles must come from a trusted source. The handles returned by
    # open() here and below are never closed explicitly.
    docs_obj = pickle.load(open(data_folder + 'documents.pkl', 'rb'))  # get_documents(data_folder)
    documents = docs_obj.docs
    # Point every loaded document at the current data folder.
    for d in documents:
        d.data_path = data_folder


    # Pair each document id / query id with its position 0..n-1.
    doc_ids_map = TwoWayDict(keys=list(docs_obj.docs_ids.keys()),
                             items=list(range(len(
                                 list(docs_obj.docs_ids.keys())
                             )))
                             )
    queries_ids_map = TwoWayDict(keys=list(queries.keys()),
                                 items=list(range(len(queries.keys())))
                                 )

    # Sizes of the id maps and of the single-token vocabulary.
    docs_num = len(doc_ids_map)
    queries_num = len(queries_ids_map)
    unigrams_num = len(vocab.vocab_1)


    # Precomputed count statistics, loaded from pickles.
    counts_unigram = pickle.load(open(statistics_folder + 'unigram_counts.pkl', 'rb'))

    counts_bigram_raw = pickle.load(open(statistics_folder + 'bigram_counts_raw.pkl', 'rb'))
    counts_bigram_inv = pickle.load(open(statistics_folder + 'bigram_counts_inv.pkl', 'rb'))
Exemplo n.º 9
0
class Substitutor:
    """Swap or neutralise gendered words in POS-tagged token sequences.

    A document is a sequence of mutable items whose element 0 is the word and
    element 1 is its part-of-speech tag (e.g. ``["his", "PRP$"]``).
    """

    def __init__(self, base_pairs, invert_cond, name_pairs=None, his_him=True):
        """Store the substitution configuration and lower-cased word pairs.

        Args:
            base_pairs: iterable of ``(male, female)`` string pairs.
            invert_cond: inversion condition; ``"invert_word_names"`` and
                ``"invert_word_neutral"`` select the matching method, any
                other value falls back to ``invert_word``.
                ``"invert_control"`` additionally forces every document
                passed to ``invert_document`` to be processed.
            name_pairs: optional iterable of ``(male, female)`` name pairs;
                ``None`` (the default) leaves the name table empty.
            his_him: whether to apply the special-case intervention to
                him/his/her/hers.
        """
        self.his_him = his_him
        self.invert_cond = invert_cond

        self.base_pairs = TwoWayDict()
        for (male, female) in base_pairs:
            self.base_pairs[male.lower()] = female.lower()

        self.name_pairs = TwoWayDict()
        # ``name_pairs`` defaults to None, which is not iterable: treat it as
        # "no name pairs" instead of raising TypeError.
        if name_pairs is not None:
            for (male, female) in name_pairs:
                self.name_pairs[male.lower()] = female.lower()

    def probablistic_substitute(self, input_texts):
        """Yield each document, passed through ``invert_document`` on a coin flip.

        Args:
            input_texts: iterable of documents.

        Yields:
            Either the result of ``invert_document(text)`` or ``text`` itself.
        """
        for text in input_texts:
            if bool(random.getrandbits(1)):
                yield self.invert_document(text)
            else:
                yield text

    def invert_document(self, input_text):
        """Substitute words of ``input_text`` in place and return it.

        The document is processed on a coin flip, or always when
        ``invert_cond`` is ``"invert_control"``. For every item with a
        substitution available, element 0 of that item is overwritten.

        Args:
            input_text: sequence of mutable ``[word, pos, ...]`` items.

        Returns:
            The same ``input_text`` object, possibly modified.
        """
        # The coin is always tossed first so the random stream is consumed
        # identically for every condition.
        if bool(random.getrandbits(1)) or self.invert_cond == "invert_control":
            if self.invert_cond == "invert_word_names":
                invert = self.invert_word_names
            elif self.invert_cond == "invert_word_neutral":
                invert = self.invert_word_neutral
            else:  # "invert_control", "invert_race" or anything else
                invert = self.invert_word

            for idx, word_pos in enumerate(input_text):
                flipped = invert(word_pos)
                if flipped is not None:
                    input_text[idx][0] = flipped

        return input_text

    @staticmethod
    def _swap_pronoun(text, pos):
        """Return the opposite-gender form of him/his/her/hers, else None."""
        if text == "him":
            return "her"
        if text == "his":
            # NOTE(review): the "NNS" tag selects the absolute possessive
            # ("hers"); confirm this matches the tagger in use.
            return "hers" if pos == "NNS" else "her"  # else: PRP/PRP$
        if text == "her":
            return "his" if pos == "PRP$" else "him"  # else: PRP
        if text == "hers":
            return "his"
        return None

    @staticmethod
    def _neutral_pronoun(text, pos):
        """Return the they-form of him/his/her/hers, else None."""
        if text == "him":
            return "them"
        if text == "his":
            return "theirs" if pos == "NNS" else "their"  # else: PRP$
        if text == "her":
            return "their" if pos == "PRP$" else "them"  # else: PRP
        if text == "hers":
            return "theirs"
        return None

    def invert_word(self, word_pos):
        """Return the gender-swapped form of a word, or None if unchanged.

        Lookup order: base pairs, name pairs, then (if ``his_him``) the
        him/his/her/hers special case.

        Args:
            word_pos: item whose element 0 is the word and element 1 its tag.

        Returns:
            The replacement, case-matched to the original word, or ``None``.
        """
        word, pos = word_pos[0], word_pos[1]
        text = word.lower()
        flipped = None

        if text in self.base_pairs.keys():
            flipped = self.base_pairs[text]
        elif text in self.name_pairs.keys():
            flipped = self.name_pairs[text]
        elif self.his_him:
            flipped = self._swap_pronoun(text, pos)

        if flipped is not None:
            # Attempt to approximate case-matching
            return self.match_case(flipped, word)
        return None

    def invert_word_neutral(self, word_pos):
        """Return the neutralised form of a word, or None if unchanged.

        "he"/"she" become "they"; base and name pairs are swapped as in
        ``invert_word``; him/his/her/hers become them/their/theirs when
        ``his_him`` is set.

        Args:
            word_pos: item whose element 0 is the word and element 1 its tag.

        Returns:
            The replacement, case-matched to the original word, or ``None``.
        """
        word, pos = word_pos[0], word_pos[1]
        text = word.lower()
        flipped = None

        if text == "he" or text == "she":
            flipped = "they"
        elif text in self.base_pairs.keys():
            flipped = self.base_pairs[text]
        elif text in self.name_pairs.keys():
            flipped = self.name_pairs[text]
        elif self.his_him:
            flipped = self._neutral_pronoun(text, pos)

        if flipped is not None:
            # Attempt to approximate case-matching
            return self.match_case(flipped, word)
        return None

    def invert_word_names(self, word_pos):
        """Like ``invert_word_neutral``, but names become a placeholder.

        Any word found in the name pairs is replaced by the literal
        ``"NAME-PLACEHOLDER"`` (then case-matched like every other result).

        Args:
            word_pos: item whose element 0 is the word and element 1 its tag.

        Returns:
            The replacement, case-matched to the original word, or ``None``.
        """
        word, pos = word_pos[0], word_pos[1]
        text = word.lower()
        flipped = None

        if text == "he" or text == "she":
            flipped = "they"
        elif text in self.base_pairs.keys():
            flipped = self.base_pairs[text]
        elif text in self.name_pairs.keys():
            flipped = "NAME-PLACEHOLDER"
        elif self.his_him:
            flipped = self._neutral_pronoun(text, pos)

        if flipped is not None:
            # Attempt to approximate case-matching
            return self.match_case(flipped, word)
        return None

    @staticmethod
    def match_case(input_string, target_string):
        """Return ``input_string`` re-cased to resemble ``target_string``.

        All-lower and all-upper targets are copied directly; a capitalised
        target capitalises the result. Any other casing (including an empty
        target) returns ``input_string`` unchanged. This is a naive
        approximation.
        """
        if target_string.islower():
            return input_string.lower()
        elif target_string.isupper():
            return input_string.upper()
        # Slices instead of [0] so empty strings cannot raise IndexError.
        elif target_string[:1].isupper() and target_string[1:].islower():
            return input_string[:1].upper() + input_string[1:].lower()
        else:
            return input_string
Exemplo n.º 10
0
class Queries:
    """Container for raw queries, their lemmatized tokens and a vocabulary."""

    def __init__(self, ngram_range=None):
        """Create an empty container.

        Args:
            ngram_range: optional ``(start, stop)`` pair of n-gram sizes used
                by ``get_token_ids``; sizes ``start .. stop - 1`` are used
                (``stop`` is exclusive). ``None`` means plain tokens.
        """
        self.queries = None
        self.lemmatized_queries = None
        # Declared here so the attribute exists before create_vocab* runs.
        self.vocab = None
        self.ngram_range = ngram_range

    def lemmatize(self, stop_words=None):
        """Tokenize and lemmatize every loaded query.

        Fills ``self.lemmatized_queries`` with ``int(query_id) -> tokens``.

        Args:
            stop_words: forwarded to both ``Tokenizer`` and ``Lemmatizer``.
        """
        tokenizer = Tokenizer(stop_words=stop_words)
        lemmatizer = Lemmatizer(stop_words=stop_words)

        self.lemmatized_queries = dict()
        for q_id in self.queries.dict.keys():
            q = self.queries.get(q_id)

            tok_q = tokenizer.fit_transform(q)
            lem_q = lemmatizer.fit_transform(tok_q)
            self.lemmatized_queries[int(q_id)] = lem_q

    def get_ngrams(self, tokens, ngram):
        """Return all contiguous ``ngram``-sized windows of ``tokens`` as tuples.

        Returns an empty list when ``tokens`` is shorter than ``ngram``.
        """
        return [
            tuple(tokens[i:i + ngram]) for i in range(len(tokens) - ngram + 1)
        ]

    def create_vocab_single(self):
        """Rebuild ``self.vocab`` from every token of the lemmatized queries.

        Tokens are numbered 0, 1, 2, ... in order of first appearance.
        """
        words = [
            w for tokens in self.lemmatized_queries.values() for w in tokens
        ]
        self.vocab = TwoWayDict()
        idx = 0
        for w in words:
            if self.vocab.dict.get(w, None) is not None:
                continue

            self.vocab.add(w, idx)
            idx += 1

    def create_vocab(self):
        """Build the vocabulary (currently single tokens only)."""
        # NOTE: an n-gram based vocabulary over ``ngram_range`` used to be
        # built here; only the single-token vocabulary is active.
        self.create_vocab_single()

    def load(self, filename):
        """Load the raw queries from ``filename`` via ``get_queries``."""
        self.queries = get_queries(filename)

    def get_token_ids(self):
        """Map every query id to the vocabulary ids of its tokens or n-grams.

        Without ``ngram_range`` each token is looked up directly. With it,
        the n-grams of each size in ``range(start, stop)`` are looked up. If
        a lookup for one n-gram size fails, an ``error: ...`` line is printed
        and that size is skipped (best effort).

        Returns:
            dict of ``query_id -> list`` of whatever ``self.vocab.get``
            returns for each token / n-gram.
        """
        res = {}
        for q_id, q in self.lemmatized_queries.items():
            q_tok_ids = []
            if self.ngram_range is not None:
                for ngram in range(self.ngram_range[0], self.ngram_range[1]):
                    try:
                        q_tok_ids += [
                            self.vocab.get(w)
                            for w in self.get_ngrams(q, ngram)
                        ]
                    except Exception:
                        # ``q`` is normally a token list, so it must be
                        # converted before concatenation; the old
                        # ``'error: ' + q`` raised TypeError in the handler.
                        print('error: ' + str(q))
            else:
                q_tok_ids = [self.vocab.get(w) for w in q]

            res[q_id] = q_tok_ids
        return res