def extractFeaturesAndLabels(inputFolder):
    """Build per-dialogue feature and label sequences for a corpus folder.

    Every dialogue set produced by ``taTool.get_data(inputFolder)`` is walked
    in order.  Each utterance contributes one entry created by
    ``generateFeatures`` and one label taken from its ``act_tag`` attribute.

    Args:
        inputFolder: Location handed unchanged to ``taTool.get_data``.

    Returns:
        A ``(features, labels)`` tuple of two parallel lists.  Each contains
        one inner list per dialogue set, and each inner list has one entry
        per utterance of that set.
    """
    dialogCorpusFeature = []
    dialogCorpusLabel = []
    for dialogSet in taTool.get_data(inputFolder):
        dialogSetFeature = []
        dialogSetLabel = []
        # Nothing precedes the first utterance of a set, so start from None.
        previousSpeaker = None
        for index, dialog in enumerate(dialogSet):
            actTag = dialog.act_tag
            currentSpeaker = dialog.speaker
            # Second argument: same speaker as the previous utterance.
            # Third argument: this is the first utterance of the set.
            feature = generateFeatures(
                dialog, currentSpeaker == previousSpeaker, index == 0)
            previousSpeaker = currentSpeaker
            # append() grows the list in place; rebuilding it with
            # ``lst = lst + [x]`` copies the whole list on every utterance.
            dialogSetFeature.append(feature)
            dialogSetLabel.append(actTag)
        dialogCorpusFeature.append(dialogSetFeature)
        dialogCorpusLabel.append(dialogSetLabel)
    return (dialogCorpusFeature, dialogCorpusLabel)
def test_data(args):
    """Tag the corpus located at ``args[1]`` with the 'advanced' CRF model.

    Args:
        args: Indexable argument list; only ``args[1]`` is read and it is
            passed to ``tool.get_data``.

    Returns:
        A dict with two keys: ``'label'`` holds the tagger output for
        ``features['feature']`` and ``'feature'`` holds the complete object
        returned by ``create_features`` (not just its ``'feature'`` entry).
    """
    corpus = tool.get_data(args[1])

    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open('advanced')

    extracted = create_features(corpus)
    predicted = crf_tagger.tag(extracted['feature'])
    return dict(label=predicted, feature=extracted)
def crftrainer(self, directry, data_model):
    """Append every conversation under *directry* to the trainer, then train.

    Args:
        directry: Location passed to ``tool.get_data``; each item it yields
            is unpacked into two values, of which only the second is used.
        data_model: Passed straight to ``self.crf_feature_train.train``
            (presumably the model output path -- confirm against the trainer).
    """
    for _name, conversation in tool.get_data(directry):
        x_seq, y_seq = self.buildmodel(conversation)
        self.crf_feature_train.append(x_seq, y_seq)

    # Train once, after all conversations have been appended.
    self.crf_feature_train.train(data_model)
def tag_dir(self, test_dir):
    """Tag every dialogue under *test_dir* and collect the predictions.

    The model file 'sequence_label_model.crfsuite' is opened on
    ``self.tagger`` first.  For each item yielded by ``get_data``, element 1
    is turned into features and the predicted tags are appended to the list
    stored in ``self.tag_data`` under element 0.

    Args:
        test_dir: Location passed to ``get_data``.
    """
    self.tagger.open('sequence_label_model.crfsuite')

    for entry in get_data(test_dir):
        # The second value returned here (the act tags) is not needed.
        feature_seq, _unused_tags = self.get_features_act_tags(entry[1])
        collected = self.tag_data[entry[0]]
        collected.extend(self.tagger.tag(feature_seq))
def _extractDialogSet(dialogSet):
    """Return the ``(features, labels)`` lists for one dialogue set."""
    features = []
    labels = []
    # Nothing precedes the first utterance of a set, so start from None.
    previousSpeaker = None
    for index, dialog in enumerate(dialogSet):
        actTag = dialog.act_tag
        currentSpeaker = dialog.speaker
        # Second argument: same speaker as the previous utterance.
        # Third argument: this is the first utterance of the set.
        features.append(
            generateFeatures(dialog, currentSpeaker == previousSpeaker, index == 0))
        labels.append(actTag)
        previousSpeaker = currentSpeaker
    return features, labels


def extractFeaturesAndLabels(inputFolder):
    """Extract feature and label sequences for every dialogue set in a folder.

    Args:
        inputFolder: Location handed unchanged to ``taTool.get_data``.

    Returns:
        A ``(features, labels)`` tuple of two parallel lists with one inner
        list per dialogue set; each inner list has one entry per utterance
        (the ``generateFeatures`` result and the ``act_tag`` respectively).
    """
    dialogCorpusFeature = []
    dialogCorpusLabel = []
    for dialogSet in taTool.get_data(inputFolder):
        dialogSetFeature, dialogSetLabel = _extractDialogSet(dialogSet)
        # Lists are grown in place; the previous ``lst = lst + [x]`` form
        # copied the accumulated list on every step.
        dialogCorpusFeature.append(dialogSetFeature)
        dialogCorpusLabel.append(dialogSetLabel)
    return (dialogCorpusFeature, dialogCorpusLabel)
def test_data(args):
    """Tag the corpus located at ``args[1]`` with the 'baselinecrf' model.

    The number of predicted labels is printed before returning.

    Args:
        args: Indexable argument list; only ``args[1]`` is read and it is
            passed to ``tool.get_data``.

    Returns:
        A dict mapping ``'label'`` to the tagger output and ``'feature'`` to
        the complete object returned by ``create_features``.
    """
    corpus = tool.get_data(args[1])

    model = pycrfsuite.Tagger()
    model.open('baselinecrf')

    extracted = create_features(corpus)
    predictions = model.tag(extracted['feature'])
    print(len(predictions))

    result = {'label': predictions, 'feature': extracted}
    return result
def read_data(args):
    """Load the corpus at ``args[0]`` and repackage its extracted features.

    Args:
        args: Indexable argument list; only ``args[0]`` is read and it is
            passed to ``tool.get_data``.

    Returns:
        A dict with the keys ``'xtrain'``, ``'ytrain'``, ``'file'`` and
        ``'length'``, filled from the ``'feature'``, ``'label'``, ``'file'``
        and ``'length'`` entries of the ``create_features`` result.
    """
    extracted = create_features(tool.get_data(args[0]))

    # (output key, key in the create_features result)
    key_pairs = (
        ('xtrain', 'feature'),
        ('ytrain', 'label'),
        ('file', 'file'),
        ('length', 'length'),
    )
    return {target: extracted[source] for target, source in key_pairs}
from pprint import pprint

__author__ = "Shurui Liu"
__email__ = "*****@*****.**"

# Record the start time of the run.
start = timeit.default_timer()

# Command-line arguments, in order: inputdir, testdir, and outputfile, e.g.
# python3 baseline_crf.py 'testdata/inputdir' 'testdata/testdir' 'baseline_output.txt'
inputdir = sys.argv[1]
testdir = sys.argv[2]
outputfile = sys.argv[3]

# Load the training and test corpora.
# (Original author's note: all the csv files, data type is generator.)
train_file = hw3_corpus_tool.get_data(inputdir)
test_file = hw3_corpus_tool.get_data(testdir)

# Materialise both results as lists: one entry per file in inputdir / testdir.
train_list = list(train_file)
test_list = list(test_file)

# Accumulators for the training features (x_train) and labels (y_train).
x_train = []
y_train = []

# Walk every file in the training list.
# NOTE(review): range(len(file) - 1) never visits the last entry of a file --
# confirm that skipping it is intentional.
for file in train_list:
    for line in range(len(file) - 1):
        line_feature = []
        # act_tag: element 0 of the current entry.
        act_tag = file[line][0]
        # NOTE(review): the rest of this loop body lies outside the visible excerpt.
def scan_input_dir(self, input_dir):
    """Append one training sequence per dialogue found under *input_dir*.

    For each item yielded by ``get_data``, element 1 is converted by
    ``self.get_features_act_tags`` into a features/act-tags pair, which is
    then handed to ``self.trainer.append``.

    Args:
        input_dir: Location passed to ``get_data``.
    """
    for entry in get_data(input_dir):
        utterances = entry[1]
        feature_seq, tag_seq = self.get_features_act_tags(utterances)
        self.trainer.append(feature_seq, tag_seq)
def crfpred(self, datafolder, output):
    """Load the data under *datafolder* and pass it to ``self.writetofile``.

    Args:
        datafolder: Location passed to ``tool.get_data``.
        output: Forwarded unchanged as the second argument of
            ``self.writetofile``.
    """
    loaded = tool.get_data(datafolder)
    self.writetofile(loaded, output)
# create a list output_tag = [] # read lines from output.txt with open(textfile) as f: for line in f: line = line.strip() if not line: # line is blank continue if line.startswith("Filename"): # comment line continue output_tag.append(line) print(len(output_tag)) # all the csv files, data type is generator dev_file = hw3_corpus_tool.get_data(devdir) dev_list = list(dev_file) dev_tag = [] for file in dev_list: for line in range(len(file) - 1): #act_tag act_tag = file[line][0] dev_tag.append(act_tag) print(len(dev_tag)) # total number of tags, correct tags number total = len(dev_tag) correct = 0
import sys

# Text types on both Python 2 (str, unicode) and Python 3 (str, str).
_TEXT_TYPES = (str, type(u""))


def _report_leaf(item):
    """Print the marker line followed by the item that cannot be split."""
    print("no furthur decomposition")
    print(item)


def check_structure(arr):
    """Recursively print every leaf element of a nested sequence.

    An element is a leaf when ``len()`` is not defined for it, or when it is
    a one-character string.  Leaves are printed after a marker line.
    Non-empty sized elements are descended into; empty ones print nothing.

    Args:
        arr: Iterable whose elements are inspected in order.
    """
    for item in arr:
        # A one-character string contains itself as its only element, so
        # descending into it would recurse until the recursion limit is hit.
        if isinstance(item, _TEXT_TYPES) and len(item) == 1:
            _report_leaf(item)
            continue
        try:
            size = len(item)
        except TypeError:
            # Only "has no length" marks a leaf; other errors must surface.
            _report_leaf(item)
            continue
        if size:
            check_structure(item)


def main():
    """Print the path given as first CLI argument and each item get_data yields."""
    # Imported here so that check_structure can be imported on its own,
    # without the corpus tool being installed.
    from hw3_corpus_tool import get_data

    path = sys.argv[1]
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("path %s" % (path,))
    for item in get_data(path):
        print(item)


if __name__ == "__main__":
    main()