Example #1
def run(self):
    logger = myLogger.myLogger("parsing_thread_log")
    linkedUrls = myLogger.myLogger("linked_urls_log", format='data')

    try:
        # Run the parser on the HTML from this page.
        parser = myParser(myurl=self.site.url, domain=self.site.domain)
        parser.parse(self.html)
    except Exception, e:
        # Mark the page as failed (status 400) and as crawled;
        # doc_id is defined elsewhere in the enclosing module.
        logger.error('urlReader line 188: ' + str(e))
        self.myDBConn.cursor.execute(
            "UPDATE sites_sitequeue SET status=400, crawled=1 WHERE id = %d;" % doc_id)
        return
Example #2
    def __init__(self, token, dictionary, output, mode):
        logger = myLogger.myLogger("Input layer initializer")
        logger.info("Initializing input raw")
        # Map every token to its dictionary id, preserving the
        # call -> sentence -> token nesting.
        self.input = map(
            lambda call: map(
                lambda sentence: map(lambda x: dictionary.token2id[x],
                                     sentence), call), token)
        self.output, self.act_dict, self.slot_dict = label_dict(output)
        self.sentence_length = 0
        self.sentence_count = 0
        for session in self.input:
            self.sentence_count = max(self.sentence_count, len(session))
            for sentence in session:
                self.sentence_length = max(self.sentence_length, len(sentence))

        # Initialize the zero-padded ndarrays
        self.input_mtr = np.zeros(
            (len(self.input), self.sentence_count, self.sentence_length))
        self.output_mtr = np.zeros(
            (len(self.input), self.sentence_count, len(self.act_dict)))
        for session_index in range(0, len(self.input)):
            for sentence_index in range(0, len(self.input[session_index])):
                # Only the act labels are recorded here
                for n in range(0,
                               len(self.input[session_index][sentence_index])):
                    self.input_mtr[session_index][sentence_index][
                        n] = self.input[session_index][sentence_index][n]
                if mode == 1:
                    for n in self.output[session_index]["act"][sentence_index]:
                        self.output_mtr[session_index][sentence_index][n] = 1
                elif mode == 2:
                    for n in self.output[session_index]["slot"][
                            sentence_index]:
                        self.output_mtr[session_index][sentence_index][n] = 1
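
The loops above zero-pad each session to the longest sentence count and each sentence to the longest token count. A minimal standalone sketch of that padding scheme, using hypothetical toy data:

import numpy as np

# Two toy sessions of token ids: the longer one has 2 sentences,
# and the longest sentence has 3 tokens.
sessions = [[[4, 7], [2, 9, 5]], [[1]]]
sentence_count = max(len(s) for s in sessions)              # 2
sentence_length = max(len(t) for s in sessions for t in s)  # 3

mtr = np.zeros((len(sessions), sentence_count, sentence_length))
for i, session in enumerate(sessions):
    for j, sentence in enumerate(session):
        mtr[i, j, :len(sentence)] = sentence
print(mtr.shape)  # (2, 2, 3); unused slots remain zero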
Example #3
def dictionary_initializer(token):
    """
    Build dictionary with token
    :param token:
    :return:
    """
    logger = myLogger.myLogger("Dictionary initializer")
    logger.info("Starting building dictionary")
    raw = map(lambda element: reduce(lambda x, y: x + y, element), token)
    dictionary = corpora.Dictionary(raw)
    logger.info("Finish building dictionary")
    return dictionary
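
For illustration, the flatten-then-index step can be run on its own (Python 2, as in the examples above, where map returns a list and reduce is a builtin; the token nesting is hypothetical):

from gensim import corpora

token = [[["hello", "world"], ["hello", "again"]]]
# reduce concatenates each call's sentences into one flat token list.
raw = map(lambda element: reduce(lambda x, y: x + y, element), token)
# raw == [['hello', 'world', 'hello', 'again']]
dictionary = corpora.Dictionary(raw)
print(dictionary.token2id)  # e.g. {'again': 0, 'hello': 1, 'world': 2}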
Example #4
def token_initializer(data):
    """
    Translate text from input into token
    :param data:
    :return:
    """
    logger = myLogger.myLogger("Token initializer")
    logger.info("Starting tokenizing")
    token = map(
        lambda element: map(lambda x: nltk.word_tokenize(x.lower()), element),
        data)
    logger.info("Tokenizing finished")
    return token
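
A hypothetical input makes the nesting concrete (assuming the NLTK punkt tokenizer data is installed):

import nltk

data = [["Hello world.", "How are you?"]]
token = map(
    lambda element: map(lambda x: nltk.word_tokenize(x.lower()), element),
    data)
# token == [[['hello', 'world', '.'], ['how', 'are', 'you', '?']]]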
Example #5
def raw_initializer(dataset):
    """
    Read input and output information from json object
    :param dataset:
    :return:
    """
    logger = myLogger.myLogger("initializer")
    logger.info("Starting raw initializing")
    input_raw = []
    output_raw = []
    for call in dataset:
        input_row = []
        output_row = []
        # if call.log["session-id"] == "voip-f32f2cfdae-130328_192703":
        for turn, label in call:
            input_row.append(turn["output"]["transcript"].lower())
            output_row.append(turn["output"]["dialog-acts"])
            input_row.append(label["transcription"].lower())
            output_row.append(label["semantics"]["json"])
        input_raw.append(input_row)
        output_raw.append(output_row)
    logger.info("Finish raw initializing")
    print(len(input_raw))
    return {"input": input_raw, "output": output_raw}
Example #6
File: corpus.py Project: hupidong/DSTC-1
def input_tokenize(sentence):
    logger = myLogger.myLogger("tokenize test")
    tokens = nltk.word_tokenize(sentence)
    # corpora.Dictionary expects a list of documents (each a list of tokens),
    # so wrap the single token list rather than passing it directly.
    dictionary = corpora.Dictionary([tokens])
    print dictionary.token2id
    logger.info("dictionary test finished")
Example #7
import myLogger
import Configuration
import datetime
from databaseConn import databaseConn

#### GLOBAL DATA ####

myDBConn = databaseConn()

#### MAIN METHOD ####
if __name__ == '__main__':

    _oLog = myLogger.myLogger(moduleName="url_assign_log")
    _oLog.info("Started parsing out new URLs")

    now = datetime.datetime.now()
    date = now.strftime('%Y%m%d')
    if Configuration.getSetting('override_date', '') != '':
        date = Configuration.getSetting('override_date', date)
    try:
        # strptime lives on the datetime class, not the datetime module.
        datetime.datetime.strptime(date, "%Y%m%d")
    except ValueError:
        _oLog.error("Parsing date is not a valid date. Must be in format: yyyyMMdd")
    _oLog.info("Running URL Assign on date: " + str(date))
    file = Configuration.getSetting("url_handoff", "C:\\UUMMUU_Code\\Logs\\" + str(date)[0:6] + "\\LINKED_URLS_LOG\\linked_urls_log_" + str(date) + '.log')

    try:
        opened_file = open(file)
    except IOError:
        _oLog.info("File is not ready to be parsed. Exiting...")
        exit()
    
Example #8
'''
    NOTE: custom status code: 501 for UUMMUU index is an unsupported file type.
'''
# sys, databaseConn, myLogger, find_RobotsTxt, Site, pageCounter and limit
# all come from the enclosing module.

# Start a logger to keep track of what breaks; it must exist before the
# error handler below uses it.
logger = myLogger.myLogger(moduleName="url_reader_log")

myDBConn = databaseConn()
result = myDBConn.getDomainCount(sys.argv)
try:
    crawl_domain = result[0][0]
    counter = int(result[0][1])
except IndexError:
    logger.error('there was an error getting any site domains to crawl')

# Get a list of the links we are to ignore for this domain.
dont_follow_links = find_RobotsTxt(crawl_domain)

# Make sure we do not crawl too many pages for a single domain.
while pageCounter < limit:
    # An index of all the words.
    index = {}
    total_unique_words = []
    repository = {}
    # A list of the links on this page.
    link_list = []
    # Set up the variables that will be put in the database; use these in
    # the SQL statement.
    site = Site()

    try:
        if crawl_domain != '':
            sql = "SELECT url, id, domain FROM sites_sitequeue WHERE crawled != 1 AND status = 200 AND domain = '%s' ORDER BY id LIMIT 1;" % crawl_domain