Example #1
    def read_csv(self):
        """
        Load local data and then generate three important data structures used for smart crawl.
        **localdata_ids** Collect a set of unique ids, e.g. {'uniqueid1', 'uniqueid2'}.

        **localdata_query** Split the fields named in querylist into a list of words for each message.
        Filter out stop words and words shorter than 3 characters.
        Then build a dict used for query pool generation, e.g. {'uniqueid': ['database', 'laboratory']}.

        **localdata_er** A list used for similarity join, e.g. [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')].
        """
        with open(self.__localPath, 'rb') as csvfile:
            reader = csv.reader(csvfile)
            data_raw = [row for row in reader]

        uniqueid_index = 0
        querylist_index = []
        matchlist_index = []
        try:
            header = data_raw.pop(0)
            uniqueid_index = header.index(self.__uniqueId)
            for q in self.__queryList:
                querylist_index.append(header.index(q))
            for m in self.__matchList:
                matchlist_index.append(header.index(m))
        except ValueError:
            print >> perr, "Can't find attributes"
            exit(0)

        localdata_query = {}
        localdata_er = []
        localdata_ids = set()
        stop_words = ['and', 'for', 'the', 'with', 'about']
        for row in data_raw:
            try:
                r_id = row[uniqueid_index]
            except IndexError:
                continue
            localdata_ids.add(r_id)

            tempbag = []
            for q in querylist_index:
                try:
                    tempbag.extend(wordset(row[q]))
                except IndexError:
                    continue
            bag = []
            for word in tempbag:
                if word not in stop_words and len(word) >= 3:
                    bag.append(word)
            localdata_query[r_id] = bag

            bag = []
            for m in matchlist_index:
                try:
                    bag.extend(wordset(row[m]))
                except IndexError:
                    continue
            localdata_er.append((bag, r_id))
        self.setlocalData(localdata_ids, localdata_query, localdata_er)
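These snippets all call a wordset() helper that is not part of the excerpt. A minimal sketch of what such a tokenizer might look like, assuming it lower-cases the field, splits on non-word characters, and returns the distinct tokens; this is a reconstruction, not the project's actual implementation.

import re

def wordset(text):
    # Assumed behaviour: lower-case, split on non-alphanumeric characters,
    # and return the distinct non-empty tokens as a set.
    return set(re.split(r'\W+', str(text).lower())) - {''}

# wordset('Simon Fraser, Database Laboratory')
# -> {'simon', 'fraser', 'database', 'laboratory'}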
Example #2
    def proResult(self, result_raw):
        """
        Merge the raw data into a dict, then pre-process it for similarity join.

        :param result_raw: the raw result returned by the API.
        :return: a list used for similarity join, e.g. [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')].
        :raises KeyError: some messages may be missing some fields.
        """
        result_merge = self.__mergeResult
        result_er = []
        for row in result_raw:
            try:
                r_id = eval(self.__uniqueId)
            except KeyError:
                continue
            if r_id not in result_merge:
                result_merge[r_id] = row
                bag = []
                for v in self.__matchList:
                    try:
                        bag.extend(wordset(eval(v)))
                    except KeyError:
                        continue
                result_er.append((bag, r_id))
        self.setMergeResult(result_merge)
        return result_er
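In this variant the configured field names are evaluated as Python expressions against each row, so a message that lacks a field raises KeyError and is skipped. A toy illustration of how that could work; the expression strings below are assumptions, not values taken from the project.

# Hypothetical configuration: uniqueId and matchList hold expressions
# that are evaluated against the local variable `row`.
row = {'id': '42', 'user': {'name': 'Yong Jun He'}}
unique_id_expr = "row['id']"            # assumed value of self.__uniqueId
match_expr = "row['user']['name']"      # assumed entry of self.__matchList

r_id = eval(unique_id_expr)             # -> '42'
field = eval(match_expr)                # -> 'Yong Jun He'
# eval("row['missing']") would raise KeyError, which proResult() catches.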
Example #3
    def read_pickle(self):
        """
        Load local data and then generate three important data structures used for smart crawl.
        **localdata_ids** Collect a set of unique ids, e.g. {'uniqueid1', 'uniqueid2'}.

        **localdata_query** Split the fields named in querylist into a list of words for each message.
        Filter out stop words and words shorter than 3 characters.
        Then build a dict used for query pool generation, e.g. {'uniqueid': ['database', 'laboratory']}.

        **localdata_er** A list used for similarity join, e.g. [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')].
        """
        with open(self.__localPath, 'rb') as f:
            data_raw = pickle.load(f)

        uniqueid = self.__uniqueId.split('.')
        querylist = []
        for q in self.__queryList:
            querylist.append(q.split('.'))
        matchlist = []
        for m in self.__matchList:
            matchlist.append(m.split('.'))

        localdata_record = {}
        localdata_query = {}
        localdata_er = []
        localdata_ids = set()
        stop_words = ['and', 'for', 'the', 'with', 'about']
        for row in data_raw:
            r_id = getElement(uniqueid, row)
            localdata_ids.add(r_id)
            localdata_record[r_id] = row

            tempbag = []
            for q in querylist:
                tempbag.extend(wordset(getElement(q, row)))
            bag = []
            for word in tempbag:
                if word not in stop_words and len(word) >= 3:
                    bag.append(word)
            localdata_query[r_id] = bag

            bag = []
            for m in matchlist:
                bag.extend(wordset(getElement(m, row)))
            localdata_er.append((bag, r_id))
        self.setlocalData(localdata_ids, localdata_query, localdata_er,
                          localdata_record)
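This variant resolves dotted field paths (e.g. 'user.id' split into ['user', 'id']) with a getElement() helper that is not shown in the excerpt. A minimal sketch of what it might do, assuming it simply walks the pre-split path through nested dicts; the real helper may handle more cases.

def getElement(path, row):
    # Assumed behaviour: follow each key of the pre-split dotted path
    # through the nested dict and return the value at the end.
    value = row
    for key in path:
        value = value[key]
    return value

# getElement(['user', 'name'], {'user': {'name': 'Simon Fraser'}})
# -> 'Simon Fraser'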
Example #4
    def read_pickle(self):
        """
        Load sample data and then generate the same data structure as localdata_query, used for smart crawl.

        **sample** Split the fields named in querylist into a list of words for each message.
        Then build a dict used for query pool generation, e.g. {'uniqueid': ['database', 'laboratory']}.
        """
        with open(self.__samplePath, 'rb') as f:
            sample_raw = pickle.load(f)

        uniqueid = self.__uniqueId.split('.')
        querylist = []
        for q in self.__queryList:
            querylist.append(q.split('.'))

        sample = {}
        for row in sample_raw:
            r_id = getElement(uniqueid, row)
            bag = []
            for q in querylist:
                bag.extend(wordset(getElement(q, row)))
            sample[r_id] = bag
        self.setSample(sample)
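A toy walk-through of the structure this method builds, using hypothetical field names and the wordset() sketch from Example #1 (token order may vary, since wordset() is assumed to return a set).

# Assumed configuration: uniqueId = 'user.id', queryList = ['user.bio'].
sample_raw = [
    {'user': {'id': 'u1', 'bio': 'Database Laboratory'}},
    {'user': {'id': 'u2', 'bio': 'Smart crawl research'}},
]
# After the loop, sample would look like:
# {'u1': ['database', 'laboratory'], 'u2': ['smart', 'crawl', 'research']}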
Example #5
    def read_csv(self):
        """
        Load sample data and then generate the same data structure as localdata_query, used for smart crawl.

        **sample** Split the fields named in querylist into a list of words for each message.
        Then build a dict used for query pool generation, e.g. {'uniqueid': ['database', 'laboratory']}.
        """
        with open(self.__samplePath, 'rb') as csvfile:
            reader = csv.reader(csvfile)
            sample_raw = [row for row in reader]

        uniqueid_index = 0
        querylist_index = []
        try:
            header = sample_raw.pop(0)
            header[0] = header[0].replace(b'\xef\xbb\xbf', '')
            uniqueid_index = header.index(self.__uniqueId)
            for q in self.__queryList:
                querylist_index.append(header.index(q))
        except ValueError:
            print >> perr, "Can't find attributes"
            exit(0)

        sample = {}
        for row in sample_raw:
            try:
                r_id = row[uniqueid_index]
            except IndexError:
                continue
            bag = []
            for q in querylist_index:
                try:
                    bag.extend(wordset(row[q]))
                except IndexError:
                    continue
            sample[r_id] = bag
        self.setSample(sample)
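The header fix-up above strips the UTF-8 byte-order mark that tools such as Excel prepend to CSV files, so that header.index() sees the clean column name. Under Python 3 the same effect could be had by opening the file with the utf-8-sig codec; a sketch with a placeholder path, not the project's code.

import csv

# 'utf-8-sig' silently drops a leading BOM, so the first column name
# comes back clean and header.index(...) works as expected.
with open('sample.csv', newline='', encoding='utf-8-sig') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)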
Example #6
    def proResult(self, result_raw):
        """
        Merge the raw data into a dict, then pre-process it for similarity join.

        :param result_raw: the raw result returned by the API.
        :return: a list used for similarity join, e.g. [(['yong', 'jun', 'he', 'simon', 'fraser'], 'uniqueid')].
        """
        uniqueid = self.__uniqueId.split('.')
        matchlist = []
        for m in self.__matchList:
            matchlist.append(m.split('.'))

        result_merge = self.__mergeResult
        result_er = []
        for row in result_raw:
            r_id = getElement(uniqueid, row)
            if r_id not in result_merge:
                result_merge[r_id] = row
                bag = []
                for m in matchlist:
                    bag.extend(wordset(getElement(m, row)))
                result_er.append((bag, r_id))
        self.setMergeResult(result_merge)
        return result_er
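A toy illustration of the de-duplication across successive API pages, assuming a hypothetical instance named crawler with uniqueId = 'id' and matchList = ['name'] (token order within a bag may vary).

# `crawler` is a hypothetical instance of the class these methods belong to.
page1 = [{'id': '1', 'name': 'Yong Jun He'}]
page2 = [{'id': '1', 'name': 'Yong Jun He'},
         {'id': '2', 'name': 'Simon Fraser'}]

er1 = crawler.proResult(page1)   # [(['yong', 'jun', 'he'], '1')]
er2 = crawler.proResult(page2)   # id '1' is already merged, so only
                                 # [(['simon', 'fraser'], '2')] comes back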
Example #7
    def read_pickle(self):
        """
        Load sample data and then generate the same data structure as localdata_query, used for smart crawl.

        **sample** Split the fields named in querylist into a list of words for each message.
        Then build a dict used for query pool generation, e.g. {'uniqueid': ['database', 'laboratory']}.
        """
        with open(self.__samplePath, 'rb') as f:
            sample_raw = pickle.load(f)

        sample = {}
        for row in sample_raw:
            try:
                r_id = eval(self.__uniqueId)
            except KeyError:
                continue
            bag = []
            for v in self.__queryList:
                try:
                    bag.extend(wordset(eval(v)))
                except KeyError:
                    continue
            sample[r_id] = bag
        self.setSample(sample)
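Like Example #2, this variant evaluates each configured field as a Python expression and silently skips fields a message does not have. A toy illustration with assumed expression strings; the field names are hypothetical.

row = {'id': 'u3'}                       # this message has no 'bio' field
query_expr = "row['bio']"                # assumed entry of self.__queryList
try:
    tokens = list(wordset(eval(query_expr)))
except KeyError:
    tokens = []                          # missing field: silently skipped
# sample['u3'] would end up as an empty list in this case.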