예제 #1
0
def dataSourceToNLP():
    """Turn every row of the input CSV into parse-tree dicts and append them to the results CSV.

    The input is read from ``ROOT/inFilename``.  For each row, the 'content'
    cell is passed through ``cleanText`` and ``nlpc.textToDoc``; the resulting
    doc is handed to ``nlpc.docToParseTreeDictList`` together with
    ``MAX_CHILD`` and ``MAX_DEPTH``.  The dicts produced for a row are written
    immediately, in append mode, to ``ROOT/results/outFilename``.
    """
    frame = pd.read_csv(f"{ROOT}/{inFilename}")
    rowCount = len(frame.index)
    # TODO: try vectorization later instead of iterating row by row.
    for rowIdx, record in frame.iterrows():
        parsed = nlpc.textToDoc(cleanText(record['content']))
        # Progress indicator: "<row index>/<total rows>".
        print(f"{rowIdx}/{rowCount}")
        # Previously tried instead of the parse-tree dicts (now disabled):
        #   nlpc.docToMaxDepthTreeNLPSequenceList(doc, 4)
        #   nlpc.docToMaxDepthsTreeNLPSequenceList(doc, maxDepths)
        rowTrees = nlpc.docToParseTreeDictList(parsed, MAX_CHILD, MAX_DEPTH)
        # NOTE(review): every row is appended with writeheader=True; whether
        # that repeats the header per row depends on io.mapListToCsv - confirm.
        io.mapListToCsv(f"{ROOT}/results/{outFilename}", rowTrees, 'a', writeheader=True)
예제 #2
0
def dataSourceUnevenHeaderToNLP_v2(stopAt=100):
    """Collect parse-tree dicts for the leading rows and write them to the results CSV in one call.

    Rows of ``ROOT/inFilename`` are processed in order until a row's index
    label is greater than ``stopAt`` (with a default 0-based integer index
    that means rows 0..stopAt inclusive - confirm against the data).  All
    dicts are accumulated in memory and then written once, in 'w' mode with
    ``writeheader=True``, to ``ROOT/results/outFilename_write``.

    Args:
        stopAt: Largest row index label that is still processed.
    """
    frame = pd.read_csv(f"{ROOT}/{inFilename}")
    rowCount = len(frame.index)
    collected = []
    # TODO: try vectorization later instead of iterating row by row.
    for rowIdx, record in frame.iterrows():
        if rowIdx > stopAt:
            break
        parsed = nlpc.textToDoc(cleanText(record['content']))
        # Progress indicator: "<row index>/<total rows>".
        print(f"{rowIdx}/{rowCount}")
        # Previously tried instead of the parse-tree dicts (now disabled):
        #   nlpc.docToMaxDepthTreeNLPSequenceList(doc, 4)
        #   nlpc.docToMaxDepthsTreeNLPSequenceList(doc, maxDepths)
        collected += nlpc.docToParseTreeDictList(parsed, MAX_CHILD, MAX_DEPTH)
    # Single write after the loop, with the complete list of dicts.
    io.mapListToCsv(f"{ROOT}/results/{outFilename_write}", collected, 'w', writeheader=True)
예제 #3
0
def run():
    """Export the multi-depth tree NLP sequences of one project's text file.

    Reads ``<inputFolder>/<project>.txt``, converts the text with
    ``nlpc.textToDoc`` and writes the result of
    ``nlpc.docToMaxDepthsTreeNLPSequenceList`` for ``range(3, 7)`` to
    ``<outputFolder>/<project>/nlp_seqs_tree_depths.csv``.
    """
    sourcePath = f"{inputFolder}/{project}.txt"
    projectOutDir = f"{outputFolder}/{project}"

    parsed = nlpc.textToDoc(io.textFileToString(sourcePath))

    # Disabled experiments that used to live here (kept for reference):
    #   nlp_dicts.csv             <- nlpc.textToNLPDictsList(text)
    #   nlp_seqs.csv              <- nlpc.docToNLPSequenceList(text)
    #   nlp_seqs_sim.csv          <- nlpc.textToSimplifiedNLPSequenceList(text)
    #   nlp_seqs_tree_d<i>.csv    <- nlpc.docToMaxDepthTreeNLPSequenceList(text, i)
    #                                for i in range(2, 6)

    depthSeqs = nlpc.docToMaxDepthsTreeNLPSequenceList(parsed, range(3, 7))
    io.mapListToCsv(f"{projectOutDir}/nlp_seqs_tree_depths.csv", depthSeqs)
예제 #4
0
def dataSourceUnevenHeaderToNLP(headerSampleCount=10, writeHeader=True):
  """Append parse-tree dicts for every input row to the results CSV using a sampled header.

  The header is derived by running the first rows (index labels up to and
  including ``headerSampleCount``) through the same pipeline as the main loop
  and passing the collected dicts to ``io.getHeaderFromMapList``.  That header
  is then supplied as ``header=`` to every ``io.mapListToCsv`` call
  (presumably the CSV column set - confirm in ``io``).

  Fix: the header used to be sampled only when ``writeHeader`` was true, yet
  the per-row writes always read ``sampledHeader``, so ``writeHeader=False``
  raised ``UnboundLocalError`` on the first row.  The header is now always
  sampled; ``writeHeader`` only controls whether the header-only write is
  issued up front.  Behaviour with ``writeHeader=True`` is unchanged.

  Args:
    headerSampleCount: Largest row index label included in the header sample.
    writeHeader: When true, issue a header-only write before the row data.
  """
  df = pd.read_csv(ROOT + '/' + inFilename)
  sizeStr = str(len(df.index))
  outPath = ROOT + '/results/' + outFilename

  # Sample the header from the leading rows.  This runs regardless of
  # writeHeader because every append below needs sampledHeader.
  # TODO: try vectorization later.
  headerSampleMaplist = []
  for index, row in df.iterrows():
    if index > headerSampleCount:
      break
    text = cleanText(row['content'])
    doc = nlpc.textToDoc(text)
    print(str(index) + '/' + sizeStr)
    treeDicts = nlpc.docToParseTreeDictList(doc, MAX_CHILD, MAX_DEPTH)
    headerSampleMaplist.extend(treeDicts)
  sampledHeader = io.getHeaderFromMapList(headerSampleMaplist)

  if writeHeader:
    # Header-only write: an empty row list with the sampled header.
    io.mapListToCsv(outPath, [], 'a', header=sampledHeader)

  # Main pass: append each row's dicts as soon as they are produced.
  for index, row in df.iterrows():
    text = cleanText(row['content'])
    doc = nlpc.textToDoc(text)
    print(str(index) + '/' + sizeStr)
    treeDicts = nlpc.docToParseTreeDictList(doc, MAX_CHILD, MAX_DEPTH)
    io.mapListToCsv(outPath, treeDicts, 'a', header=sampledHeader)
def run():
    """Write three NLP CSV exports for one project's text file.

    Reads ``<inputFolder>/<project>.txt`` and writes, into
    ``<outputFolder>/<project>/``:

    * ``nlp_dicts.csv``    - result of ``nlpc.textToNLPDictsList``
    * ``nlp_seqs.csv``     - result of ``nlpc.textToNLPSequenceList``
    * ``nlp_seqs_sim.csv`` - result of ``nlpc.textToSimplifiedNLPSequenceList``
    """
    sourcePath = f"{inputFolder}/{project}.txt"
    projectOutDir = f"{outputFolder}/{project}"

    rawText = io.textFileToString(sourcePath)

    # Copy into a fresh list before writing, as the original accumulation did.
    sentenceDicts = list(nlpc.textToNLPDictsList(rawText))
    io.mapListToCsv(f"{projectOutDir}/nlp_dicts.csv", sentenceDicts)

    fullSeqs = nlpc.textToNLPSequenceList(rawText)
    io.mapListToCsv(f"{projectOutDir}/nlp_seqs.csv", fullSeqs)

    simplifiedSeqs = nlpc.textToSimplifiedNLPSequenceList(rawText)
    io.mapListToCsv(f"{projectOutDir}/nlp_seqs_sim.csv", simplifiedSeqs)
예제 #6
0
import sys
import io_local.io as io
import nlp.nlp_controller as nlpc

# Hard-coded local storage roots for inputs and outputs.
inputFolder = 'C:/Users/John/Documents/pgi_dev/NLP_local_storage/inputs'
outputFolder = 'C:/Users/John/Documents/pgi_dev/NLP_local_storage/outputs'

# Echo the raw command line; the first argument is the project name.
print(sys.argv)
project = sys.argv[1]

# NOTE(review): these two paths are built but never used below - the I/O
# calls receive `project` itself.  Confirm which one io.csvToList and
# io.mapListToCsv are supposed to get.
inFilename = f"{inputFolder}/{project}.csv"
outFilename = f"{outputFolder}/{project}.csv"

textArrList = io.csvToList(project)
print(textArrList)

# Run the first cell of every row through the NLP step and gather all
# resulting dicts into one flat list.
res = []
for textArr in textArrList:
    text = textArr[0]
    sentNlpMapList = nlpc.textToNLPDictsList(text)
    res.extend(sentNlpMapList)

io.mapListToCsv(project, res)