#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Write element 2 of every record returned by sample.get_summaries to a file.

Usage: <script> INPUT OUTPUT

INPUT is passed unchanged to ``sample.get_summaries``.
OUTPUT is the path of the file that is (over)written with the results.
"""

import sys

import sample


if __name__ == "__main__":
    # argv[1]: argument forwarded to sample.get_summaries
    # argv[2]: path of the output file, opened in text write mode
    sentences = sample.get_summaries(sys.argv[1])

    # The with-block closes the handle even if a write (or indexing a
    # record) raises; a manual close() would be skipped on that path.
    with open(sys.argv[2], "w") as outFH:
        for sentence in sentences:
            # Element 2 is written verbatim: no separator or newline is added.
            # NOTE(review): presumably element 2 is the summary text -- confirm
            # against sample.get_summaries.
            outFH.write(sentence[2])
'zwei-glorreiche-halunken.html' """ regex = re.compile(ur'^(.*\.html).*$') return regex.match(filename).group(1) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Extracts movie summarys' + ' and movie ratings') parser.add_argument('--input', required=True, help='input directory') parser.add_argument('--output', required=True, help='output file name for CSV file') parser.add_argument('--ratings-map', required=True, help='Map from file name to rating') args = parser.parse_args() sentences = sample.get_summaries(args.input) outFH = open(args.output, "w") writer = csv.writer(outFH) tmp = OrderedDict() m = load_ratings_map(args.ratings_map) for s in sentences: curFile = clean_filename(s[0]) if not curFile in tmp: tmp[curFile] = [] tmp[curFile].append(s[2].replace("\n", " ")) notFound = 0 invalid = 0 for (fileName, summaryFragments) in tmp.iteritems(): if not fileName in m: notFound += 1 continue