Example #1
def metaDataExtractor(groupedUtterances, markers, corpusType=''):
    results = []
    for i, convo in enumerate(groupedUtterances):
        if i % 2500 == 10:  # log progress once every 2500 conversations
            logger1.log("On " + str(i) + " of " + str(len(groupedUtterances)))

        toAppend = findMarkersInConvo(markers, convo)
        toAppend = addFeats(toAppend, convo[0], True, corpusType)
        results.append(toAppend)
    return results
Example #2
def metaDataExtractor(groupedUtterances, markers, corpusType=''):
    results = []
    for i, convo in enumerate(groupedUtterances):
        if i % 2500 == 10:  # log progress once every 2500 conversations
            logger1.log("On " + str(i) + " of " + str(len(groupedUtterances)))

        toAppend = findMarkersInConvo(markers, convo)
        toAppend = addFeats(toAppend, convo[0], True, corpusType)
        results.append(toAppend)
    return results
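The i % 2500 == 10 guard above just emits a progress line once every 2,500 conversations. A minimal, self-contained sketch of the same pattern using the standard logging module instead of the project's logger1 helper (the per-conversation work below is only a placeholder):

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
log = logging.getLogger(__name__)

def process_conversations(conversations, log_every=2500):
    # Placeholder per-conversation work; logs progress once per log_every items.
    results = []
    for i, convo in enumerate(conversations):
        if i % log_every == 0:
            log.info("On %d of %d", i, len(conversations))
        results.append(len(convo))
    return results

process_conversations([["hi", "hello"]] * 6000)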
Example #3
def runFormula(results, markers, smoothing, corpusType):
    toReturn = []
    categories = allMarkers(markers)
    for i, result in enumerate(results):
        if i % 1000 == 10:  # log progress once every 1000 results
            logger1.log("On result " + str(i) + " of " + str(len(results)))
        for j, category in enumerate(categories):
            toAppend = createAlignmentDict(category, result, smoothing, corpusType)
            if toAppend is not None:
                toReturn.append(toAppend)
    toReturn = sorted(toReturn, key=lambda k: (k["speakerId"], k["replierId"], k["category"]))
    return toReturn
Example #4
def runFormula(results, markers, smoothing, corpusType):
    toReturn = []
    categories = allMarkers(markers)
    for i, result in enumerate(results):
        if i % 1000 == 10:  # log progress once every 1000 results
            logger1.log("On result " + str(i) + " of " + str(len(results)))
        for j, category in enumerate(categories):
            toAppend = createAlignmentDict(category, result, smoothing,
                                           corpusType)
            if toAppend is not None:
                toReturn.append(toAppend)
    toReturn = sorted(toReturn,
                      key=lambda k:
                      (k["speakerId"], k["replierId"], k["category"]))
    return toReturn
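The final sorted call orders the alignment dictionaries by speaker, replier, and category. An equivalent key is operator.itemgetter, which builds the same tuple; this is a standalone illustration rather than code from the original project:

from operator import itemgetter

rows = [
    {"speakerId": "s2", "replierId": "r1", "category": "you"},
    {"speakerId": "s1", "replierId": "r2", "category": "i"},
    {"speakerId": "s1", "replierId": "r1", "category": "we"},
]

# itemgetter with several keys returns a (speakerId, replierId, category)
# tuple, matching the lambda used above.
rows.sort(key=itemgetter("speakerId", "replierId", "category"))
print(rows)  # s1/r1/we, then s1/r2/i, then s2/r1/you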
Example #5
def writeFile(results, outputFile, shouldWriteHeader):
    if len(results) == 0:
        logger1.log("No results to write =(")
        return
    toWrite = []
    header = sorted(list(results[0].keys()))
    for row in results:
        toAppend = []
        for key in header:
            toAppend.append(row[key])
        toWrite.append(toAppend)
    if shouldWriteHeader:
        with open(outputFile, "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows([header])
    with open(outputFile, "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(toWrite)
Example #6
def writeFile(results, outputFile, shouldWriteHeader):
    if len(results) == 0:
        logger1.log("No results to write =(")
        return
    toWrite = []
    header = sorted(list(results[0].keys()))
    for row in results:
        toAppend = []
        for key in header:
            toAppend.append(row[key])
        toWrite.append(toAppend)
    if shouldWriteHeader:
        with open(outputFile, "w", newline='') as f:
            writer = csv.writer(f)
            writer.writerows([header])
    with open(outputFile, "a", newline='') as f:
        writer = csv.writer(f)
        writer.writerows(toWrite)
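writeFile assembles each output row by hand and opens the file twice: once in "w" mode for the header and once in "a" mode for the data. csv.DictWriter can do the same job in one pass; a sketch under the same assumption that every dict in results shares the same keys:

import csv

def write_file_dictwriter(results, output_file, should_write_header):
    # Write a list of same-keyed dicts to CSV in a single open/close.
    if not results:
        return
    header = sorted(results[0].keys())
    # "w" truncates and writes a header row; "a" appends to an existing file.
    mode = "w" if should_write_header else "a"
    with open(output_file, mode, newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        if should_write_header:
            writer.writeheader()
        writer.writerows(results)

write_file_dictwriter(
    [{"speakerId": "s1", "alignment": 0.1}], "demo_results.csv", True)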
Example #7
def shuffleUtterances(utterances, shuffleIds, shuffleTweets, shuffleTokens,
                      combineMsgReply):
    replyUserIds = []
    msgUserIds = []
    replyTweets = []
    msgTweets = []
    allReplyTokens = []
    allMsgTokens = []
    replyLengths = []
    msgLengths = []
    for i, utterance in enumerate(utterances):
        if i % 10000 == 0:  # log progress once every 10000 utterances
            logger1.log("Adding to utterances " + str(i) + " of " +
                        str(len(utterances)))
        msgUserIds.append(utterance["msgUserId"])
        msgTweets.append(utterance["msgTokens"])
        allMsgTokens.extend(utterance["msgTokens"])
        msgLengths.append(len(utterance["msgTokens"]))
        if not combineMsgReply:  #if we're shuffling msgs and replies together, put everything in msgs
            replyUserIds.append(utterance["replyUserId"])
            replyTweets.append(utterance["replyTokens"])
            allReplyTokens.extend(utterance["replyTokens"])
            replyLengths.append(len(utterance["replyTokens"]))
        else:
            msgUserIds.append(utterance["replyUserId"])
            msgTweets.append(utterance["replyTokens"])
            allMsgTokens.extend(utterance["replyTokens"])
            msgLengths.append(len(utterance["replyTokens"]))

    shuffle(msgUserIds)
    shuffle(msgTweets)
    shuffle(allMsgTokens)
    if not combineMsgReply:
        shuffle(replyUserIds)
        shuffle(replyTweets)
        shuffle(allReplyTokens)
    else:
        # only shuffle msgLengths if we're combining msgs and replies
        shuffle(msgLengths)

    replyMarkerCount = 0
    msgMarkerCount = 0

    msgLengthsNew = []
    replyLengthsNew = []

    for i, utterance in enumerate(utterances):
        utterances[i]["msg"] = ""
        utterances[i]["reply"] = ""

        if (shuffleIds):
            if not combineMsgReply:
                utterances[i]["msgUserId"] = msgUserIds[i]
                utterances[i]["replyUserId"] = replyUserIds[i]
            else:
                utterances[i]["msgUserId"] = msgUserIds[2 * i]
                utterances[i]["replyUserId"] = msgUserIds[2 * i + 1]

        if (shuffleTweets):
            if not combineMsgReply:
                utterances[i]["msgTokens"] = msgTweets[i]
                utterances[i]["replyTokens"] = replyTweets[i]
            else:
                utterances[i]["msgTokens"] = msgTweets[2 * i]
                utterances[i]["replyTokens"] = msgTweets[2 * i + 1]

        if (shuffleTokens):
            if not combineMsgReply:
                utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(
                    msgMarkerCount + msgLengths[i])]
                msgMarkerCount += msgLengths[i]
                utterances[i]["replyTokens"] = allReplyTokens[
                    replyMarkerCount:(replyMarkerCount + replyLengths[i])]
                replyMarkerCount += replyLengths[i]
            else:
                utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(
                    msgMarkerCount + msgLengths[2 * i])]
                msgMarkerCount += msgLengths[2 * i]
                msgLengthsNew.append(msgLengths[2 * i])
                utterances[i]["replyTokens"] = allMsgTokens[msgMarkerCount:(
                    msgMarkerCount + msgLengths[2 * i + 1])]
                msgMarkerCount += msgLengths[2 * i + 1]
                replyLengthsNew.append(msgLengths[2 * i + 1])
        utterances[i]["convId"] = (utterances[i]["msgUserId"],
                                   utterances[i]["replyUserId"])
    return utterances
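Shuffling msgUserIds, msgTweets, and the pooled token lists independently is what turns the real conversations into a chance baseline: each list keeps the same contents, but the actual message/reply pairings are destroyed. A tiny seeded illustration of that effect (random.shuffle works in place; the original presumably imports shuffle from random and does not seed it):

import random

random.seed(0)  # seeded only to make this demo reproducible

msgs = ["good morning", "thanks a lot", "see you soon"]
replies = ["morning!", "you're welcome", "bye"]

# Shuffling each list separately scrambles which reply is paired with which
# message, while each list still contains exactly the same items.
random.shuffle(msgs)
random.shuffle(replies)
print(list(zip(msgs, replies)))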
Example #8
                msgMarkerCount += msgLengths[2 * i]
                msgLengthsNew.append(msgLengths[2 * i])
                utterances[i]["replyTokens"] = allMsgTokens[msgMarkerCount:(
                    msgMarkerCount + msgLengths[2 * i + 1])]
                msgMarkerCount += msgLengths[2 * i + 1]
                replyLengthsNew.append(msgLengths[2 * i + 1])
        utterances[i]["convId"] = (utterances[i]["msgUserId"],
                                   utterances[i]["replyUserId"])
    return utterances


#Core calls
start = logger1.initialize()

#Reading in user info and tweets
logger1.log("Reading user info...")
users = readUserInfo()
logger1.log("Reading messages...")
result = readCSV(inputFile, users, numMarkers)
rows = result["rows"]
markers = result["markers"]

#Shuffling tweets if any shuffling has been requested
if (someShuffling):
    logger1.log(rows[0])
    rows = shuffleUtterances(rows, shuffleIds, shuffleTweets, shuffleMarkers,
                             combineMsgReply)
    logger1.log(rows[0])

#Adding user info & extracting markers from messages
utterances = transformCSV(markers, users, rows)
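someShuffling, shuffleIds, shuffleTweets, shuffleMarkers, and combineMsgReply are defined elsewhere in twtr.py and are not shown in this listing. One plausible wiring, purely as a hypothetical sketch, is to expose them as command-line flags and derive someShuffling as their disjunction:

import argparse

# Hypothetical flag wiring; the real twtr.py may simply hard-code these settings.
parser = argparse.ArgumentParser(description="Twitter alignment preprocessing")
parser.add_argument("--shuffle-ids", action="store_true")
parser.add_argument("--shuffle-tweets", action="store_true")
parser.add_argument("--shuffle-markers", action="store_true")
parser.add_argument("--combine-msg-reply", action="store_true")
args = parser.parse_args()

shuffleIds = args.shuffle_ids
shuffleTweets = args.shuffle_tweets
shuffleMarkers = args.shuffle_markers
combineMsgReply = args.combine_msg_reply
someShuffling = shuffleIds or shuffleTweets or shuffleMarkers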
Example #9
File: twtr.py  Project: langcog/alignment
def shuffleUtterances(utterances, shuffleIds, shuffleTweets, shuffleTokens, combineMsgReply):
	replyUserIds = []
	msgUserIds = []
	replyTweets = []
	msgTweets = []
	allReplyTokens = []
	allMsgTokens = []
	replyLengths = []
	msgLengths = []
	for i, utterance in enumerate(utterances):
		if i % 10000 == 0:
			logger1.log("Adding to utterances " + str(i) + " of " + str(len(utterances)))
		msgUserIds.append(utterance["msgUserId"])
		msgTweets.append(utterance["msgTokens"])
		allMsgTokens.extend(utterance["msgTokens"])
		msgLengths.append(len(utterance["msgTokens"]))
		if not combineMsgReply:						#if we're shuffling msgs and replies together, put everything in msgs
			replyUserIds.append(utterance["replyUserId"])
			replyTweets.append(utterance["replyTokens"])
			allReplyTokens.extend(utterance["replyTokens"])
			replyLengths.append(len(utterance["replyTokens"]))
		else:
			msgUserIds.append(utterance["replyUserId"])
			msgTweets.append(utterance["replyTokens"])			
			allMsgTokens.extend(utterance["replyTokens"])
			msgLengths.append(len(utterance["replyTokens"]))
	
	shuffle(msgUserIds); shuffle(msgTweets); shuffle(allMsgTokens)
	if not combineMsgReply: 
		shuffle(replyUserIds); shuffle(replyTweets); shuffle(allReplyTokens)
	else:
		shuffle(msgLengths)		#only shuffle msgLengths if we're combining msgs and replies
	
	replyMarkerCount = 0
	msgMarkerCount = 0
	
	msgLengthsNew = []
	replyLengthsNew = []

	for i, utterance in enumerate(utterances):		
		utterances[i]["msg"] = ""
		utterances[i]["reply"] = ""

		if(shuffleIds):
			if not combineMsgReply:
				utterances[i]["msgUserId"] = msgUserIds[i]
				utterances[i]["replyUserId"] = replyUserIds[i] 
			else:
				utterances[i]["msgUserId"] = msgUserIds[2*i]
				utterances[i]["replyUserId"] = msgUserIds[2*i+1] 
				
		
		if(shuffleTweets):
			if not combineMsgReply:
				utterances[i]["msgTokens"] = msgTweets[i]
				utterances[i]["replyTokens"] = replyTweets[i]
			else:
				utterances[i]["msgTokens"] = msgTweets[2*i]
				utterances[i]["replyTokens"] = msgTweets[2*i+1]
		
		if(shuffleTokens):
			if not combineMsgReply:
				utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount+msgLengths[i])]
				msgMarkerCount += msgLengths[i]
				utterances[i]["replyTokens"] = allReplyTokens[replyMarkerCount:(replyMarkerCount+replyLengths[i])]
				replyMarkerCount += replyLengths[i]
			else:
				utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount+msgLengths[2*i])]
				msgMarkerCount += msgLengths[2*i]
				msgLengthsNew.append(msgLengths[2*i])
				utterances[i]["replyTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount+msgLengths[2*i+1])]
				msgMarkerCount += msgLengths[2*i+1]
				replyLengthsNew.append(msgLengths[2*i+1])
		utterances[i]["convId"] = (utterances[i]["msgUserId"],utterances[i]["replyUserId"])
	return utterances
Example #10
File: twtr.py  Project: langcog/alignment
			else:
				utterances[i]["msgTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount+msgLengths[2*i])]
				msgMarkerCount += msgLengths[2*i]
				msgLengthsNew.append(msgLengths[2*i])
				utterances[i]["replyTokens"] = allMsgTokens[msgMarkerCount:(msgMarkerCount+msgLengths[2*i+1])]
				msgMarkerCount += msgLengths[2*i+1]
				replyLengthsNew.append(msgLengths[2*i+1])
		utterances[i]["convId"] = (utterances[i]["msgUserId"],utterances[i]["replyUserId"])
	return utterances


#Core calls
start = logger1.initialize()

#Reading in user info and tweets
logger1.log("Reading user info...")
users = readUserInfo()
logger1.log("Reading messages...")
result = readCSV(inputFile, users, numMarkers)
rows = result["rows"]
markers = result["markers"]

#Shuffling tweets if any shuffling has been requested
if someShuffling:
	logger1.log(rows[0])
	rows = shuffleUtterances(rows, shuffleIds, shuffleTweets, shuffleMarkers, combineMsgReply)
	logger1.log(rows[0])

#Adding user info & extracting markers from messages
utterances = transformCSV(markers, users, rows)