def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    """Parse each document and write an ArffJson corpus file.

    Writes a one-line JSON header followed by one ArffJson line per
    document whose sources include "zbmath metadata".

    :param filepaths: iterable of paths to raw document files
    :param tokens2IndexMap: dict mapping token -> attribute index;
        only its size is used here (num-attributes in the header)
    """
    p = DocumentParser()
    # ``with`` flushes and closes the file even if parsing raises.
    with open("raw_data/fulltext-corpus.json", "w") as f:
        # BUGFIX: the original header omitted the opening quote of both
        # keys, emitting invalid JSON ({relation-name":...}).
        f.write(
            "{\"relation-name\":\"full-text-corpus\","
            "\"num-attributes\":" + str(len(tokens2IndexMap)) + "}\n"
        )
        for filepath in filepaths:
            doc = p.parse(filepath)
            # Only documents backed by zbMATH metadata enter the corpus.
            if "zbmath metadata" in doc.includedSources:
                f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    """Parse each document and write an ArffJson corpus file.

    NOTE(review): this is a byte-for-byte duplicate of the definition
    above; one of the two should be deleted. Kept and fixed here so the
    surviving definition is the corrected one either way.

    :param filepaths: iterable of paths to raw document files
    :param tokens2IndexMap: dict mapping token -> attribute index;
        only its size is used here (num-attributes in the header)
    """
    p = DocumentParser()
    # ``with`` flushes and closes the file even if parsing raises.
    with open("raw_data/fulltext-corpus.json", "w") as f:
        # BUGFIX: the original header omitted the opening quote of both
        # keys, emitting invalid JSON ({relation-name":...}).
        f.write(
            "{\"relation-name\":\"full-text-corpus\","
            "\"num-attributes\":" + str(len(tokens2IndexMap)) + "}\n"
        )
        for filepath in filepaths:
            doc = p.parse(filepath)
            # Only documents backed by zbMATH metadata enter the corpus.
            if "zbmath metadata" in doc.includedSources:
                f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    """Parse documents and dump each one's data map as a JSON file.

    For every (filename, filepath) pair, parses the document and — if its
    sources include "zbmath metadata" — writes ``targetDir/<filename>.json``
    containing the document's data map.

    :param tokens2IndexMap: dict mapping token -> attribute index,
        forwarded to ``doc.toDataMap``
    :param filenameFilepathsPairs: iterable of (output-name, input-path)
    :param targetDir: directory receiving the per-document JSON files
    """
    p = DocumentParser()
    totalDocs = len(filenameFilepathsPairs)
    # enumerate replaces the hand-rolled counter from the original.
    for count, (filename, filepath) in enumerate(filenameFilepathsPairs):
        doc = p.parse(filepath)
        # Single-argument print() is valid on both Python 2 and 3.
        print(str(count) + " / " + str(totalDocs))
        if "zbmath metadata" in doc.includedSources:
            dataMap = doc.toDataMap(tokens2IndexMap)
            # ``with`` closes the output file even if dumps/write raises.
            with open(path.join(targetDir, filename + ".json"), "w") as f:
                f.write(json.dumps(dataMap))
def buildWordCountDict(filepaths):
    """Count token occurrences across documents with zbMATH metadata.

    Parses every file, and for each document whose sources include
    "zbmath metadata" adds one count per token occurrence.

    :param filepaths: iterable of document file paths
    :return: dict mapping token -> total occurrence count
    """
    p = DocumentParser()
    wordCounts = {}
    total = len(filepaths)
    # enumerate replaces the hand-rolled counter from the original.
    for count, filepath in enumerate(filepaths):
        # Progress line; single-argument print() works on Python 2 and 3.
        print(str(count) + "/" + str(total))
        doc = p.parse(filepath)
        if "zbmath metadata" in doc.includedSources:
            for token in doc.tokens:
                # get() collapses the membership-check-then-init idiom.
                wordCounts[token] = wordCounts.get(token, 0) + 1
    return wordCounts
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    """Parse documents and dump each one's data map as a JSON file.

    NOTE(review): this is a byte-for-byte duplicate of the earlier
    definition with the same name; one of the two should be deleted.
    Fixed identically here so the surviving definition is correct.

    :param tokens2IndexMap: dict mapping token -> attribute index,
        forwarded to ``doc.toDataMap``
    :param filenameFilepathsPairs: iterable of (output-name, input-path)
    :param targetDir: directory receiving the per-document JSON files
    """
    p = DocumentParser()
    totalDocs = len(filenameFilepathsPairs)
    # enumerate replaces the hand-rolled counter from the original.
    for count, (filename, filepath) in enumerate(filenameFilepathsPairs):
        doc = p.parse(filepath)
        # Single-argument print() is valid on both Python 2 and 3.
        print(str(count) + " / " + str(totalDocs))
        if "zbmath metadata" in doc.includedSources:
            dataMap = doc.toDataMap(tokens2IndexMap)
            # ``with`` closes the output file even if dumps/write raises.
            with open(path.join(targetDir, filename + ".json"), "w") as f:
                f.write(json.dumps(dataMap))
def buildWordCountDict(filepaths):
    """Count token occurrences across documents with zbMATH metadata.

    NOTE(review): this is a byte-for-byte duplicate of the earlier
    definition with the same name; one of the two should be deleted.
    Fixed identically here so the surviving definition is correct.

    :param filepaths: iterable of document file paths
    :return: dict mapping token -> total occurrence count
    """
    p = DocumentParser()
    wordCounts = {}
    total = len(filepaths)
    # enumerate replaces the hand-rolled counter from the original.
    for count, filepath in enumerate(filepaths):
        # Progress line; single-argument print() works on Python 2 and 3.
        print(str(count) + "/" + str(total))
        doc = p.parse(filepath)
        if "zbmath metadata" in doc.includedSources:
            for token in doc.tokens:
                # get() collapses the membership-check-then-init idiom.
                wordCounts[token] = wordCounts.get(token, 0) + 1
    return wordCounts