示例#1
0
def resource_bowtie2_log(uri, **kwargs):
    """Read the read counts from a bowtie2 log file into a DataFrame.

    Every line that does not start with ``Warn`` is reduced to its leading
    integer by a regex substitution. The last such line is discarded and
    the remaining values are converted to ``int``.

    Args:
        uri: path of the bowtie2 log file.
        **kwargs: accepted but not used here.

    Returns:
        DataFrame with a single ``counts`` column and an index named
        ``statistic`` holding five fixed labels.
    """
    with open(uri) as handle:
        text = handle.read()

    # Keep the leading number of every non-warning line.
    numbers = [
        re.sub(r' *(\d+)[ %]+.+', r'\1', line)
        for line in text.strip().split('\n')
        if not line.startswith('Warn')
    ]

    labels = [
        'Number of Reads',
        'Number Unpaired',
        'Number Unaligned',
        'Number Uniquely Aligned',
        'Number Ambiguously Aligned',
    ]

    # The final retained line is dropped before the integer conversion.
    counts = [int(number) for number in numbers[:-1]]
    frame = DataFrame(counts, index=labels)
    frame.index.name = 'statistic'
    frame.columns = ['counts']
    return frame
示例#2
0
def resource_cutadapt_metrics(uri, **kwargs):
    """Load one section of a cutadapt report into a DataFrame.

    The report text is cut at its ``=== ... ===`` banner lines. The chunk
    that follows the first banner is parsed line by line with ``_split_x``,
    skipping empty lines.

    Args:
        uri: path of the cutadapt report.
        **kwargs: accepted but not used here.

    Returns:
        DataFrame built from the ``statistic`` / ``value`` records, indexed
        by ``statistic``, with ``value`` converted by ``pd.to_numeric``.
    """
    with open(uri) as handle:
        report = handle.read()

    # Element 0 is whatever precedes the first banner; element 1 is the
    # section parsed here.
    chunks = re.split("\n===.*===\n", report)

    records = []
    for line in chunks[1].split("\n"):
        if line:
            records.append(_split_x(line))

    frame = DataFrame.from_records(records,
                                   index=["statistic"],
                                   columns=["statistic", "value"])
    frame["value"] = pd.to_numeric(frame["value"])
    return frame
示例#3
0
def _reader(uri):
    with open(uri) as fh:
        data = [x.strip("\n").split("\t") for x in fh if not x.strip() == ""]
        indices = list((i for i, val in enumerate(data)
                        if val[0].startswith("## METRICS CLASS")))
        metrics = DataFrame.from_records(data[(indices[0] + 2):],
                                         columns=data[(indices[0] + 1)],
                                         index="CATEGORY")
    return (metrics, None)
示例#4
0
def _hist_reader(uri):
    with open(uri) as fh:
        data = [x.strip("\n").split("\t") for x in fh if not x.strip() == ""]
        indices = list((i for i, val in enumerate(data)
                        if val[0].startswith("## METRICS CLASS")
                        or val[0].startswith("## HISTOGRAM")))
        if len(indices) == 1:
            indices.append(len(data))
        metrics = DataFrame.from_records(data[(indices[0] + 2):(indices[1])],
                                         columns=data[(indices[0] + 1)])
        # We could be missing the histogram
        try:
            hist = DataFrame.from_records(data[(indices[1] + 2):],
                                          columns=data[(indices[1] + 1)])
        except:
            logger.warn("No histogram data for {}".format(uri))
            hist = None
    return (metrics, hist)
示例#5
0
def resource_genome_results(uri, key="Globals", **kwargs):
    """Parse a sectioned results report and return one section as a DataFrame.

    The text is split at header lines of the form ``>>>> Section name``
    (presumably a Qualimap ``genome_results.txt`` report; confirm against
    the caller). Section names have their spaces replaced by underscores;
    the text before the first header is named ``Header``.

    Args:
        uri: path of the report file.
        key: name of the section to return, e.g. ``"Globals"``.
        **kwargs: accepted but not used here.

    Returns:
        DataFrame for the requested section. ``Coverage_per_contig`` is
        indexed by ``chr``; other sections are indexed by ``statistic``
        with a ``value`` column. All sections except ``Input`` are passed
        through ``pd.to_numeric``.

    Raises:
        KeyError: if ``key`` is ``"Header"``, ``"Coverage"`` or not a
            section present in the file.
    """
    with open(uri) as fh:
        data = fh.read()

    # Raw strings: ``\s`` inside a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    sections = re.split(r">+\s+[a-zA-Z ]+", data)
    section_names = ["Header"] + [
        name.replace(" ", "_")
        for name in re.findall(r">+\s+([a-zA-Z ]+)", data)
    ]

    d = {}
    for h, sec in zip(section_names, sections):
        if h in ("Coverage", "Header"):
            # These sections are deliberately not parsed.
            continue
        rows = [x for x in sec.split("\n") if x]
        if h == "Coverage_per_contig":
            d[h] = DataFrame.from_records(
                [re.split(r"\s+", x.strip()) for x in rows],
                columns=COVERAGE_PER_CONTIG_COLUMNS,
                index="chr")
            d[h] = d[h].apply(pd.to_numeric)
        else:
            d[h] = DataFrame.from_records(
                [_split_x(x) for x in rows],
                columns=["statistic", "value"],
                index="statistic")
            # ``Input`` is left unconverted.
            if h not in ["Input"]:
                d[h] = d[h].apply(pd.to_numeric)
    return d[key]
示例#6
0
# In[2]:

# Create a MongoDB client with the driver's default connection arguments
# and select the `nytimes3` database.
client = MongoClient()
db = client.nytimes3


# In[3]:

# Number of documents in the `articles` collection.
# NOTE(review): `Collection.count()` is deprecated in newer PyMongo releases
# in favour of `count_documents()` — confirm against the installed version.
total = db.articles.count()
# One hundredth of the collection size, rounded up; presumably the step for
# progress reporting further down — TODO confirm, the usage is not visible here.
percent = math.ceil(total / 100)
# Counter initialised to zero; presumably tracks processed documents — TODO confirm.
count = 0
print('Total documents:', total)
print()

total_rows_df = DataFrame()
try:
    for doc in db.articles.find():
        rows_df = DataFrame()
        try:

            # Texts
            common_texts = []
            if doc.get('abstract'):
                common_texts.append(doc['abstract'])
            if doc.get('headline') and isinstance(doc['headline'], dict) and doc['headline'].get('main'):
                common_texts.append(doc['headline']['main'])
            if doc.get('lead_paragraph'):
                common_texts.append(doc['lead_paragraph'])
            # add snippet as variable field